library("tidyverse")
── Attaching core tidyverse packages ──────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.0     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library("data.table")
Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     
data.table 1.14.10 using 1 threads (see ?getDTthreads).  Latest news: r-datatable.com
**********
This installation of data.table has not detected OpenMP support. It should still work but in single-threaded mode.
This is a Mac. Please read https://mac.r-project.org/openmp/. Please engage with Apple and ask them for support. Check r-datatable.com for updates, and our Mac instructions here: https://github.com/Rdatatable/data.table/wiki/Installation. After several years of many reports of installation problems on Mac, it's time to gingerly point out that there have been no similar problems on Windows or Linux.
**********

Attaching package: ‘data.table’

The following objects are masked from ‘package:lubridate’:

    hour, isoweek, mday, minute, month, quarter, second, wday, week, yday,
    year

The following objects are masked from ‘package:dplyr’:

    between, first, last

The following object is masked from ‘package:purrr’:

    transpose
library("rtracklayer")
Loading required package: GenomicRanges
Warning: package ‘GenomicRanges’ was built under R version 4.2.2
Loading required package: stats4
Loading required package: BiocGenerics

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:lubridate’:

    intersect, setdiff, union

The following objects are masked from ‘package:dplyr’:

    combine, intersect, setdiff, union

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep, grepl,
    intersect, is.unsorted, lapply, Map, mapply, match, mget, order, paste,
    pmax, pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce, rownames,
    sapply, setdiff, sort, table, tapply, union, unique, unsplit, which.max,
    which.min

Loading required package: S4Vectors
Warning: package ‘S4Vectors’ was built under R version 4.2.2

Attaching package: ‘S4Vectors’

The following objects are masked from ‘package:data.table’:

    first, second

The following objects are masked from ‘package:lubridate’:

    second, second<-

The following objects are masked from ‘package:dplyr’:

    first, rename

The following object is masked from ‘package:tidyr’:

    expand

The following objects are masked from ‘package:base’:

    expand.grid, I, unname

Loading required package: IRanges

Attaching package: ‘IRanges’

The following object is masked from ‘package:data.table’:

    shift

The following object is masked from ‘package:lubridate’:

    %within%

The following objects are masked from ‘package:dplyr’:

    collapse, desc, slice

The following object is masked from ‘package:purrr’:

    reduce

Loading required package: GenomeInfoDb
Warning: package ‘GenomeInfoDb’ was built under R version 4.2.2
library("ggrastr")
library("glue")

Attaching package: ‘glue’

The following object is masked from ‘package:GenomicRanges’:

    trim

The following object is masked from ‘package:IRanges’:

    trim
library("DESeq2")
Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats

Attaching package: ‘matrixStats’

The following object is masked from ‘package:dplyr’:

    count


Attaching package: ‘MatrixGenerics’

The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse, colCounts,
    colCummaxs, colCummins, colCumprods, colCumsums, colDiffs, colIQRDiffs,
    colIQRs, colLogSumExps, colMadDiffs, colMads, colMaxs, colMeans2,
    colMedians, colMins, colOrderStats, colProds, colQuantiles, colRanges,
    colRanks, colSdDiffs, colSds, colSums2, colTabulates, colVarDiffs,
    colVars, colWeightedMads, colWeightedMeans, colWeightedMedians,
    colWeightedSds, colWeightedVars, rowAlls, rowAnyNAs, rowAnys,
    rowAvgsPerColSet, rowCollapse, rowCounts, rowCummaxs, rowCummins,
    rowCumprods, rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks, rowSdDiffs,
    rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars, rowWeightedMads,
    rowWeightedMeans, rowWeightedMedians, rowWeightedSds, rowWeightedVars

Loading required package: Biobase
Welcome to Bioconductor

    Vignettes contain introductory material; view with 'browseVignettes()'. To
    cite Bioconductor, see 'citation("Biobase")', and for packages
    'citation("pkgname")'.


Attaching package: ‘Biobase’

The following object is masked from ‘package:MatrixGenerics’:

    rowMedians

The following objects are masked from ‘package:matrixStats’:

    anyMissing, rowMedians
library("ggpubr")
library("wigglescout")
library("eulerr")
library("ggplot2")
library(corrplot)
corrplot 0.92 loaded
# export 
result_folder = "../results/"
plot_folder = "../results/plots/"
stat_output = "../results/stats/"

combined_bigwigs <- list.files("../mendeley/Figure2","CTCF_ChIP-seq.+bw",
                  full.names = TRUE)
geo_bigwigs <- list.files("../mendeley/Figure2","GSM.+bw",
                  full.names = TRUE)
# colors
mypal <-c("cornflowerblue","orange","red2","darkgreen","#505050")
mypal <-c("#00619D","#A82A34","orange","seagreen","#505050")
mypal2 <-c("#00619D","#7FB0CE","#A82A34","#D39499","orange","#FFD55A","seagreen","#7DBA9C","#505050")
# diff signal function
bw_granges_diff_analysis <- function(granges_c1,
                                     granges_c2,
                                     label_c1,
                                     label_c2,
                                     estimate_size_factors = FALSE,
                                     as_granges = FALSE) {
  # Bind first, get numbers after
  names_values <- NULL
  fields <- names(mcols(granges_c1))
  
  if ("name" %in% fields) {
    names_values <- mcols(granges_c1)[["name"]]
    granges_c1 <- granges_c1[, fields[fields != "name"]]
  }
  
  fields <- names(mcols(granges_c2))
  if ("name" %in% fields) {
    granges_c2 <- granges_c2[, fields[fields != "name"]]
  }
  
  cts_df <- cbind(data.frame(granges_c1), mcols(granges_c2))
  
  if (!is.null(names_values)) {
    rownames(cts_df) <- names_values
  }
  
  # Needs to drop non-complete cases and match rows
  complete <- complete.cases(cts_df)
  cts_df <- cts_df[complete, ]
  
  values_df <- cts_df[, 6:ncol(cts_df)] %>% dplyr::select(where(is.numeric))
  cts <- get_nreads_columns(values_df)
  
  condition_labels <- c(rep(label_c1, length(mcols(granges_c1))), rep(label_c2, length(mcols(granges_c2))))
  
  
  coldata <- data.frame(colnames(cts), condition = as.factor(condition_labels))
  print(coldata)
  
  dds <- DESeq2::DESeqDataSetFromMatrix(
    countData = cts,
    colData = coldata,
    design = ~ condition,
    rowRanges = granges_c1[complete, ]
  )
  
  
  if (estimate_size_factors == TRUE) {
    dds <- DESeq2::estimateSizeFactors(dds)
  }
  else {
    # Since files are scaled, we do not want to estimate size factors
    sizeFactors(dds) <- c(rep(1, ncol(cts)))
  }
  
  dds <- DESeq2::estimateDispersions(dds)
  dds <- DESeq2::nbinomWaldTest(dds)
  
  if (as_granges) {
    result <- DESeq2::results(dds, format = "GRanges", alpha = 0.01)
    if (!is.null(names_values)) {
      result$name <- names_values[complete]
    }
    
  }
  else {
    # result <- results(dds, format="DataFrame")
    result <- dds
  }
  
  result
}

get_nreads_columns <- function(df, length_factor = 100) {
  # Convert mean coverages to round integer read numbers
  cts <- as.matrix(df)
  cts <- as.matrix(cts[complete.cases(cts), ])
  cts <- round(cts * length_factor)
  cts
}
ctcf.and.G4.pro <- import("../peaks/G4 CTCF_with_promoters_sorted.bed")
ctcf.not.G4.pro <- import("../peaks/CTCF-only_with_promoters_sorted.bed")
ctcf.and.G4.npr <- import("../peaks/G4 CTCF_without_promoters_sorted.bed")
ctcf.not.G4.npr <- import("../peaks/CTCF-only_without_promoters_sorted.bed")

ctcf.and.G4.pro$class <- "CTCF_and_G4"
ctcf.not.G4.pro$class <- "CTCF_not_G4"
ctcf.and.G4.npr$class <- "CTCF_and_G4"
ctcf.not.G4.npr$class <- "CTCF_not_G4"
ctcf.and.G4.pro$pro <- "Pro"
ctcf.not.G4.pro$pro <- "Pro"
ctcf.and.G4.npr$pro <- "noPro"
ctcf.not.G4.npr$pro <- "noPro"

ctcf.and.G4 <- c(ctcf.and.G4.pro,ctcf.and.G4.npr)
ctcf.not.G4 <- c(ctcf.not.G4.pro,ctcf.not.G4.npr)
ctcf <- c(ctcf.and.G4,ctcf.not.G4)
ctcf <- sortSeqlevels(ctcf)
ctcf <- sort(ctcf)
names(ctcf) <- paste0(ctcf$class," ",ctcf$pro)
peaks_bed <- "../peaks/CTCF_G4_in_6_categories.bed"
export.bed(ctcf, peaks_bed)

check MACS peak widths

ctcf.raw <- import("../peaks/CTCF_mES_peaks.narrowPeak")
G4.raw <- import("../peaks/G4_WT_peaks.narrowPeak")
ctcf.raw$class <- "CTCF"
G4.raw$class <- "G4"
ctcf.raw$pro <- "NA"
G4.raw$pro <- "NA"
df <- rbind(as.data.frame(ctcf.raw)[c(1,2,3,4,5,12,13)], as.data.frame(G4.raw)[c(1,2,3,4,5,12,13)],as.data.frame(ctcf,row.names = NULL))
ggdensity(df,x="width",y = "density",color = "class", fill="class", alpha=0.1, palette=mypal) + scale_x_continuous(limits = c(100,600))
Warning: Removed 3661 rows containing non-finite outside the scale range (`stat_density()`).

min(width(G4.raw))
[1] 233
min(width(ctcf.raw))
[1] 200
median(width(G4.raw))
[1] 316
median(width(ctcf.raw))
[1] 332

calculate coverage over peaks

# Wulfridge CTCF ChIP-Seq mocks
mocks_bigwigs = c("../data/CutNTag_ChIP-Seq/bw/SRR23958387_GSM7116277_E14_Mock_CTCF_Rep1_Mus_musculus_ChIP-Seq.CPMnorm.bw",
          "../data/CutNTag_ChIP-Seq/bw/SRR23958386_GSM7116278_E14_Mock_CTCF_Rep2_Mus_musculus_ChIP-Seq.CPMnorm.bw")

# PDS
# Wulfridge CTCF ChIP-Seq
pds_bigwigs = c("../data/CutNTag_ChIP-Seq/bw/SRR23958385_GSM7116279_E14_PDS_CTCF_Rep1_Mus_musculus_ChIP-Seq.CPMnorm.bw",
                "../data/CutNTag_ChIP-Seq/bw/SRR23958384_GSM7116280_E14_PDS_CTCF_Rep2_Mus_musculus_ChIP-Seq.CPMnorm.bw")

pdc_bigwigs = c("../data/CutNTag_ChIP-Seq/bw/SRR23958383_GSM7116281_E14_PhenDC3_CTCF_Rep1_Mus_musculus_ChIP-Seq.CPMnorm.bw",
                "../data/CutNTag_ChIP-Seq/bw/SRR23958382_GSM7116282_E14_PhenDC3_CTCF_Rep2_Mus_musculus_ChIP-Seq.CPMnorm.bw")

G4_bigwigs = c("../mendeley/Figure1/G4_CUT-Tag_thisStudy.bw")

cov.mocks <- bw_loci(mocks_bigwigs,peaks_bed,labels=c("mock_1","mock_2"))
cov.pds <- bw_loci(pds_bigwigs,peaks_bed,labels=c("PDS_1","PDS_2"))
cov.pdc <- bw_loci(pdc_bigwigs,peaks_bed,labels=c("PhenDC3_1","PhenDC3_2"))
cov.G4 <- bw_loci(G4_bigwigs,peaks_bed,labels=c("G4_NT"))
cov.combined <- bw_loci(combined_bigwigs,peaks_bed,labels=c("mock","PDS","PhenDC3"))
cov.geo <- bw_loci(geo_bigwigs,peaks_bed,labels=c("GEO_mock","GEO_PDS","GEO_PhenDC3"))

quick comparison on deposited and reprocessed bigwig files

corr_df <- data.frame(data.frame(cov.combined)[,c(9,6,7,8)],data.frame(cov.geo)[,6:8],data.frame(cov.mocks)[,6:7],data.frame(cov.pds)[,6:7],data.frame(cov.pdc)[,6:7])
res <- cor(corr_df[,2:13])
corrplot(res, type = "upper", order = "hclust",tl.col = "black", tl.srt = 45,addCoef.col = 'black',tl.pos = 'd',    cl.pos = 'n', col.lim=c(0.93, 1),is.corr = F, method = 'color')

p1 <- plot_bw_profile(c(mocks_bigwigs[1],pds_bigwigs[1],pdc_bigwigs[1]),loci = "../peaks/G4 CTCF_with_promoters_sorted.bed",mode = "center",verbose = F) + coord_cartesian(ylim=c(0,2.4))
p2 <- plot_bw_profile(c(mocks_bigwigs[2],pds_bigwigs[2],pdc_bigwigs[2]),loci = "../peaks/G4 CTCF_with_promoters_sorted.bed",mode = "center",verbose = F) + coord_cartesian(ylim=c(0,2.4))
p3 <- plot_bw_profile(bwfiles = combined_bigwigs,loci = "../peaks/G4 CTCF_with_promoters_sorted.bed",mode = "center",verbose = F) + coord_cartesian(ylim=c(0,2.4))
ggarrange(p1,p2,p3, ncol=3)

p1 <- plot_bw_profile(bwfiles = geo_bigwigs,loci = "../peaks/G4 CTCF_with_promoters_sorted.bed",mode = "center",verbose = F) + coord_cartesian(ylim=c(0,5))
p2 <- plot_bw_profile(bwfiles = combined_bigwigs,loci = "../peaks/G4 CTCF_with_promoters_sorted.bed",mode = "center",verbose = F) + coord_cartesian(ylim=c(0,2.6))
ggarrange(p1,p2, ncol=2)

p1 <- plot_bw_profile(bwfiles = geo_bigwigs,loci = "../peaks/CTCF-only_with_promoters_sorted.bed",mode = "center",verbose = F) + coord_cartesian(ylim=c(0,5))
Warning in .summarize_matrix(fg, label) :
  Profile plot: 324 generated ( 0.0538563829787234 per locus)
Warning in .summarize_matrix(fg, label) :
  Profile plot: 324 generated ( 0.0538563829787234 per locus)
Warning in .summarize_matrix(fg, label) :
  Profile plot: 324 generated ( 0.0538563829787234 per locus)
p2 <- plot_bw_profile(bwfiles = combined_bigwigs,loci = "../peaks/CTCF-only_with_promoters_sorted.bed",mode = "center",verbose = F) + coord_cartesian(ylim=c(0,2.6))
Warning in .summarize_matrix(fg, label) :
  Profile plot: 324 generated ( 0.0538563829787234 per locus)
Warning in .summarize_matrix(fg, label) :
  Profile plot: 324 generated ( 0.0538563829787234 per locus)
Warning in .summarize_matrix(fg, label) :
  Profile plot: 324 generated ( 0.0538563829787234 per locus)
ggarrange(p1,p2, ncol=2)

p1 <- plot_bw_loci_scatter(mocks_bigwigs[1],mocks_bigwigs[2], loci = "../peaks/G4 CTCF_with_promoters_sorted.bed", verbose = F)
p2 <- plot_bw_loci_scatter(pds_bigwigs[1],pds_bigwigs[2], loci = "../peaks/G4 CTCF_with_promoters_sorted.bed", verbose = F)
p3 <- plot_bw_loci_scatter(pdc_bigwigs[1],pdc_bigwigs[2], loci = "../peaks/G4 CTCF_with_promoters_sorted.bed", verbose = F)
ggarrange(p1,p2,p3, ncol=3)

p1 <- plot_bw_loci_scatter(combined_bigwigs[1],combined_bigwigs[2], loci = "../peaks/G4 CTCF_with_promoters_sorted.bed", verbose = F)
p2 <- plot_bw_loci_scatter(combined_bigwigs[2],combined_bigwigs[3], loci = "../peaks/G4 CTCF_with_promoters_sorted.bed", verbose = F)
p3 <- plot_bw_loci_scatter(combined_bigwigs[1],combined_bigwigs[3], loci = "../peaks/G4 CTCF_with_promoters_sorted.bed", verbose = F)
ggarrange(p1,p2,p3, ncol=3)

results <- data.frame(
  as.data.frame(cov.mocks),
  as.data.frame(cov.pds)[6:7],
  as.data.frame(cov.pdc)[6:7],
  as.data.frame(cov.G4)[6],
  raw.lfc.pds_1 = log2(cov.pds$PDS_1 / cov.mocks$mock_1),
  raw.lfc.pds_2 = log2(cov.pds$PDS_2 / cov.mocks$mock_2),
  raw.lfc.pdc_1 = log2(cov.pds$PDS_1 / cov.mocks$mock_1),
  raw.lfc.pdc_2 = log2(cov.pdc$PhenDC3_2 / cov.mocks$mock_2),
  raw.lfc.pds = log2(rowMeans(as.data.frame(cov.pds)[6:7]) / rowMeans(as.data.frame(cov.mocks)[6:7])),
  raw.lfc.pdc = log2(rowMeans(as.data.frame(cov.pdc)[6:7]) / rowMeans(as.data.frame(cov.mocks)[6:7])),
  mean.mock = rowMeans(as.data.frame(cov.mocks)[6:7]),
  mean.pds = rowMeans(as.data.frame(cov.pds)[6:7]),
  mean.pdc = rowMeans(as.data.frame(cov.pdc)[6:7])
)

cov.mocks$name <- NULL
cov.pds$name <- NULL
cov.pdc$name <- NULL

de_pds <- bw_granges_diff_analysis(cov.mocks, cov.pds, "Mock", "PDS")
converting counts to integer mode
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
lfc_pds = DESeq2::lfcShrink(de_pds, coef = "condition_PDS_vs_Mock", type = "apeglm")
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
de_pdc <- bw_granges_diff_analysis(cov.mocks, cov.pdc, "Mock", "PhenDC3")
converting counts to integer mode
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
lfc_pdc = DESeq2::lfcShrink(de_pdc, coef = "condition_PhenDC3_vs_Mock", type = "apeglm")
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
results$deseq.lfc.pds <- results(de_pds)$log2FoldChange
results$deseq.lfcs.pds <- lfc_pds$log2FoldChange
results$deseq.padj.pds <- lfc_pds$padj
results$deseq.mean.pds <- log2(lfc_pds$baseMean)
results$deseq.sig.pds <- lfc_pds$pvalue < 0.05

results$deseq.lfc.pdc <- results(de_pdc)$log2FoldChange
results$deseq.lfcs.pdc <- lfc_pdc$log2FoldChange
results$deseq.padj.pdc <- lfc_pdc$padj
results$deseq.mean.pdc <- log2(lfc_pdc$baseMean)
results$deseq.sig.pdc <- lfc_pdc$pvalue < 0.05

results$class <- gsub(" .+", "", results$name)
results$pro <- gsub(".+ ", "", results$name)
results$pro <- factor(results$pro, levels = c("Pro", "noPro"))
results$class <- factor(results$class, levels = c("CTCF_and_G4", "CTCF_not_G4"))

results$log2.mock <- log2(results$mean.mock)
results$log2.pds <- log2(results$mean.pds)
results$log2.pdc <- log2(results$mean.pdc)
results$log2.G4 <- log2(results$G4_NT)

results$deseq.sigup.pds <- results$deseq.sig.pds &
  results$deseq.lfc.pds > 0
results$deseq.sigup.pdc <- results$deseq.sig.pdc &
  results$deseq.lfc.pdc > 0
write.table(results, glue("{result_folder}foldchange_results.txt"))
table(results$name)

CTCF_and_G4 noPro   CTCF_and_G4 Pro CTCF_not_G4 noPro   CTCF_not_G4 Pro 
              988              1300             43309              6016 
#results <- read.table("foldchange_results.txt")
results$class <- factor(results$class, levels = c("CTCF_and_G4", "CTCF_not_G4"))
p <- ggviolin(
  results,
  x = "class",
  y = "mean.mock",
  fill = "class",
  palette = mypal,
  add = "median_iqr"
) + coord_cartesian(ylim = c(0, 10))
annotate_figure(p, fig.lab = "CTCF signal by class in mock condition, mean of two reps, median+iqr", fig.lab.size = 6)

ggsave(glue("{plot_folder}Violin_CTCF_classes.pdf"), last_plot())
Saving 3 x 3 in image
p <- ggviolin(
  results,
  x = "class",
  y = "mean.mock",
  fill = "pro",
  palette = mypal,
  add = "median_iqr"
) + coord_cartesian(ylim = c(0, 10))
annotate_figure(p, fig.lab = "CTCF signal by class in mock condition, mean of two reps, median+iqr", fig.lab.size = 6)

ggsave(glue("{plot_folder}Violin_CTCF_classes_pro.pdf"),
       last_plot())
Saving 3 x 3 in image
mdf <- reshape2::melt(dplyr::select(
  results,
  c(
    "class",
    "mock_1",
    "mock_2",
    "PDS_1",
    "PDS_2",
    "PhenDC3_1",
    "PhenDC3_2"
  )
))
Using class as id variables
ggboxplot(
  mdf,
  x = "variable",
  y = "value",
  fill = "class",
  palette = mypal
) + coord_cartesian(ylim = c(0, 10))

ggsave(glue("{plot_folder}Boxplot_CTCF_reps.pdf"), last_plot())
Saving 5 x 3 in image
mdf <- reshape2::melt(dplyr::select(
  results,
  c(
    "class",
    "mock_1",
    "mock_2",
    "PDS_1",
    "PDS_2",
    "PhenDC3_1",
    "PhenDC3_2"
  )
))
Using class as id variables
mdf_stats_class = compare_means(value ~ class, group.by = "variable", data = mdf)

ggviolin(
  mdf,
  x = "variable",
  y = "value",
  fill = "class",
  palette = mypal,
  add = "median_iqr"
) + coord_cartesian(ylim = c(0, 10))

ggsave(glue("{plot_folder}Violin_CTCF_reps.pdf"), last_plot())
Saving 5 x 3 in image
mdf <- reshape2::melt(dplyr::select(results, c(
  "class", "deseq.lfc.pds", "deseq.lfc.pdc"
)))
Using class as id variables
p <- ggviolin(
  mdf,
  x = "variable",
  y = "value",
  fill = "class",
  palette = mypal,
  add = "median_iqr"
) + geom_hline(yintercept = 0, linetype = "dotted") + coord_cartesian(ylim =
                                                                        c(-3, 3)) + stat_compare_means(aes(group = class), label.y = 3, size = 2)

annotate_figure(p, fig.lab = "DESEq foldchange mean treat vs mean mock, median+iqr", fig.lab.size = 6)
Warning: Removed 3 rows containing non-finite outside the scale range (`stat_ydensity()`).
Warning: Removed 3 rows containing non-finite outside the scale range (`stat_summary()`).
Warning: Removed 3 rows containing non-finite outside the scale range (`stat_compare_means()`).

ggsave(
  glue("{plot_folder}Violin_CTCF_lfc.pdf"),
  last_plot(),
  width = 5,
  height = 5
)
p <- ggboxplot(
  mdf,
  x = "variable",
  y = "value",
  fill = "class",
  palette = mypal
) + geom_hline(yintercept = 0, linetype = "dotted") + coord_cartesian(ylim =
                                                                        c(-2, 2))
annotate_figure(p, fig.lab = "DESEq foldchange mean treat vs mean mock, median+iqr", fig.lab.size = 6)
Warning: Removed 3 rows containing non-finite outside the scale range (`stat_boxplot()`).

ggsave(glue("{plot_folder}Boxplot_CTCF_lfc.pdf"), last_plot())
Saving 3 x 3 in image
mdf <- reshape2::melt(dplyr::select(results, c(
  "pro", "class", "deseq.lfc.pds", "deseq.lfc.pdc"
)))
Using pro, class as id variables
mdf$pro <- factor(mdf$pro, levels = c("Pro", "noPro"))
mdf$x <- as.factor(paste0(mdf$class, " ", mdf$variable))
mdf$x <- factor(mdf$x, levels = levels(mdf$x)[c(2, 1, 4, 3)])
p <- ggviolin(
  mdf,
  x = "x",
  y = "value",
  fill = "class",
  palette = mypal,
  add = "median_iqr",
  facet.by = "pro"
) + geom_hline(yintercept = 0, linetype = "dotted") + coord_cartesian(ylim =
                                                                        c(-2, 2)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
annotate_figure(p, fig.lab = "DESEq foldchange mean treat vs mean mock, median+iqr", fig.lab.size = 6)
Warning: Removed 3 rows containing non-finite outside the scale range (`stat_ydensity()`).
Warning: Removed 3 rows containing non-finite outside the scale range (`stat_summary()`).

ggsave(glue("{plot_folder}Violin_CTCF_lfc_pro.pdf"), last_plot())
Saving 5 x 6 in image
med <- aggregate(deseq.lfc.pds ~ class,
                 data = results,
                 FUN = "median",
                 na.rm = T)
med$deseq.fc.pds <- 2 ^ med$deseq.lfc.pds
med
med <- aggregate(deseq.lfc.pds ~ name,
                 data = results,
                 FUN = "median",
                 na.rm = T)
med$deseq.fc.pds <- 2 ^ med$deseq.lfc.pds
med
med <- aggregate(deseq.lfc.pds ~ class,
                 data = results,
                 FUN = "median",
                 na.rm = T)
med$deseq.fc.pds <- 2 ^ med$deseq.lfc.pds
med
med <- aggregate(deseq.lfc.pdc ~ name,
                 data = results,
                 FUN = "median",
                 na.rm = T)
med$deseq.fc.pdc <- 2 ^ med$deseq.lfc.pdc
med
deseq_lfc_stats = compare_means(deseq.lfc.pds ~ class, data = results)
deseq_lfc_stats
write_tsv(deseq_lfc_stats,
          glue("{stat_output}pds-deseq2_lfc_statistics.tsv"))
deseq_lfc_stats = compare_means(deseq.lfc.pdc ~ class, data = results)
deseq_lfc_stats
write_tsv(deseq_lfc_stats,
          glue("{stat_output}phendc-deseq2_lfc_statistics.tsv"))
mdf <- reshape2::melt(dplyr::select(
  results,
  c(
    "class",
    "raw.lfc.pds_1",
    "raw.lfc.pds_2",
    "raw.lfc.pdc_1",
    "raw.lfc.pdc_2"
  )
))
Using class as id variables
ggviolin(
  mdf,
  x = "variable",
  y = "value",
  fill = "class",
  palette = mypal,
  add = "median_iqr"
) + geom_hline(yintercept = 0, linetype = "dotted") + coord_cartesian(ylim =
                                                                        c(-3, 3)) +
  stat_compare_means(aes(group = class), label.y = 3.6, size = 2)
Warning: Removed 469 rows containing non-finite outside the scale range (`stat_ydensity()`).
Warning: Removed 469 rows containing non-finite outside the scale range (`stat_summary()`).
Warning: Removed 469 rows containing non-finite outside the scale range
(`stat_compare_means()`).

ggsave(glue("{plot_folder}Violin_CTCF_lfc_individual_reps.pdf"), last_plot())
Saving 5 x 3 in image
Warning: Removed 469 rows containing non-finite outside the scale range (`stat_ydensity()`).
Warning: Removed 469 rows containing non-finite outside the scale range (`stat_summary()`).
Warning: Removed 469 rows containing non-finite outside the scale range
(`stat_compare_means()`).
ggdensity(
  results,
  x = "raw.lfc.pds",
  color = "class",
  fill = "class",
  palette = mypal
) + geom_vline(xintercept = 0, linetype = "dotted")
Warning: Removed 26 rows containing non-finite outside the scale range (`stat_density()`).

ggdensity(
  results,
  x = "raw.lfc.pdc",
  color = "class",
  fill = "class",
  palette = mypal
) + geom_vline(xintercept = 0, linetype = "dotted")
Warning: Removed 23 rows containing non-finite outside the scale range (`stat_density()`).

ggscatter(
  results,
  x = "log2.mock",
  y = "log2.pds",
  size = 1,shape=20,
  alpha = 0.1,
  color = "deseq.sig.pds",
  palette = mypal[c(5, 2)]
)

ggsave(glue("{plot_folder}Scatter_CTCF_DESEq_PDSvsMock.pdf"), rasterize(last_plot(),dpi = 600),width = 3, height= 3.2)
ggscatter(
  results,
  x = "log2.mock",
  y = "log2.pdc",
  size = 1,shape=20,
  alpha = 0.1,
  color = "deseq.sig.pdc",
  palette = mypal[c(5, 2)]
)

ggsave(glue("{plot_folder}Scatter_CTCF_DESEq_PhenDC3vsMock.pdf"), rasterize(last_plot(),dpi = 600),width = 3, height= 3.2)
tb <- table(results$class)
deseq.stats <- as.data.frame(t(as.matrix(tb)))
deseq.stats[1,] <- colSums(deseq.stats)
rownames(deseq.stats) <- c("Total")
deseq.stats.percent <- deseq.stats
deseq.stats.percent[1,] <- c(100,100)
tb

CTCF_and_G4 CTCF_not_G4 
       2288       49325 
tb <- table(results$deseq.sig.pds &
        (results$deseq.lfc.pds > 0),
      results$class)
deseq.stats <- rbind(deseq.stats, PDS.sig.up = as.data.frame.matrix(tb)[2,])
tb
       
        CTCF_and_G4 CTCF_not_G4
  FALSE        2220       46812
  TRUE           68        2510
tb <- prop.table(table(results$deseq.sig.pds &
        (results$deseq.lfc.pds > 0),
      results$class),margin = 2)*100
deseq.stats.percent <- rbind(deseq.stats.percent, PDS.sig.up  = as.data.frame.matrix(tb)[2,])
tb
       
        CTCF_and_G4 CTCF_not_G4
  FALSE   97.027972   94.910993
  TRUE     2.972028    5.089007
table(results$deseq.sig.pds &
        (results$deseq.lfc.pds > 0),
      results$class)
       
        CTCF_and_G4 CTCF_not_G4
  FALSE        2220       46812
  TRUE           68        2510
mdf <- reshape2::melt(table(
  results$deseq.sig.pds & (results$deseq.lfc.pds > 0),
  results$class
))
ggplot(mdf, aes(Var1, Var2, fill = value)) +
  geom_tile(show.legend = F) + geom_text(aes(label = value)) +
  scale_fill_gradient(low = "white", high = "orange") + theme_minimal()

vl <- list(
  sig = grep("TRUE", results$deseq.sigup.pds),
  CTCF_G4 = grep("and", results$class),
  CTCFonly = grep("not", results$class)
)
plot(euler(vl), quantities = T)

tb <- table(results$deseq.sig.pdc &
        (results$deseq.lfc.pdc > 0),
      results$class)
deseq.stats <- rbind(deseq.stats, PhenDC3.sig.up = as.data.frame.matrix(tb)[2,])
tb
       
        CTCF_and_G4 CTCF_not_G4
  FALSE        2251       48545
  TRUE           37         780
tb <- prop.table(table(results$deseq.sig.pdc &
        (results$deseq.lfc.pdc > 0),
      results$class),margin = 2)*100
deseq.stats.percent <- rbind(deseq.stats.percent, PhenDC3.sig.up = as.data.frame.matrix(tb)[2,])
tb
       
        CTCF_and_G4 CTCF_not_G4
  FALSE   98.382867   98.418652
  TRUE     1.617133    1.581348
table(results$deseq.sig.pdc &
        (results$deseq.lfc.pdc > 0),
      results$class)
       
        CTCF_and_G4 CTCF_not_G4
  FALSE        2251       48545
  TRUE           37         780
mdf <- reshape2::melt(table(
  results$deseq.sig.pdc & (results$deseq.lfc.pdc > 0),
  results$class
))
ggplot(mdf, aes(Var1, Var2, fill = value)) +
  geom_tile(show.legend = F) + geom_text(aes(label = value)) +
  scale_fill_gradient(low = "white", high = "orange") + theme_minimal()

results$uid <- seq(1:nrow(results))
vl <- list(
  sig = grep("TRUE", results$deseq.sigup.pdc),
  CTCF_G4 = grep("and", results$class),
  CTCFonly = grep("not", results$class)
)
plot(euler(vl), quantities = T)

table(results$deseq.sigup.pds, results$deseq.sigup.pdc)
       
        FALSE  TRUE
  FALSE 48576   456
  TRUE   2217   361
mdf <- reshape2::melt(table(results$deseq.sigup.pds, results$deseq.sigup.pdc))
ggplot(mdf, aes(Var1, Var2, fill = value)) +
  geom_tile(show.legend = F) + geom_text(aes(label = value)) +
  scale_fill_gradient(low = "white", high = "orange") + theme_minimal()

tb <- table(results$deseq.sigup.pds & results$deseq.sigup.pdc,
      results$class)
deseq.stats <- rbind(deseq.stats, both.sig.up = as.data.frame.matrix(tb)[2,])
tb
       
        CTCF_and_G4 CTCF_not_G4
  FALSE        2277       48975
  TRUE           11         350
tb <- prop.table(table(results$deseq.sigup.pds & results$deseq.sigup.pdc,
      results$class),margin = 2)*100
deseq.stats.percent <- rbind(deseq.stats.percent, both = as.data.frame.matrix(tb)[2,])
tb
       
        CTCF_and_G4 CTCF_not_G4
  FALSE  99.5192308  99.2904207
  TRUE    0.4807692   0.7095793
table(results$deseq.sigup.pds & results$deseq.sigup.pdc, results$class)
       
        CTCF_and_G4 CTCF_not_G4
  FALSE        2277       48975
  TRUE           11         350
mdf <- reshape2::melt(table(results$deseq.sigup.pds & results$deseq.sigup.pdc, results$class))
ggplot(mdf, aes(Var1, Var2, fill = value)) +
  geom_tile(show.legend = F) + geom_text(aes(label = value)) +
  scale_fill_gradient(low = "white", high = "orange") + theme_minimal()

deseq.stats$class <- rownames(deseq.stats) 
mdf <- reshape2::melt(deseq.stats)
Using class as id variables
ggplot(mdf, aes(variable, class, fill = value)) +
  geom_tile(show.legend = F) + geom_text(aes(label = value)) +
  scale_fill_gradient(low = "white", high = "orange") + theme_minimal()

ggsave(glue("{plot_folder}Table_CTCF_DESEq_Sigup.pdf"), rasterize(last_plot(),dpi = 600),width = 2.5, height= 2.5)
deseq.stats.percent$class <- rownames(deseq.stats.percent) 
mdf <- reshape2::melt(deseq.stats.percent[-1,])
Using class as id variables
ggplot(mdf, aes(variable, class, fill = value)) +
  geom_tile(show.legend = F) + geom_text(aes(label = round(mdf$value,2))) +
  scale_fill_gradient(low = "white", high = "orange") + theme_minimal()
Warning: Use of `mdf$value` is discouraged.
ℹ Use `value` instead.

ggsave(glue("{plot_folder}Table_CTCF_DESEq_Sigup_percent.pdf"), rasterize(last_plot(),dpi = 600),width = 2.5, height= 2.5)
Warning: Use of `mdf$value` is discouraged.
ℹ Use `value` instead.
vl <- list(
  sig_PDS = grep("TRUE", results$deseq.sigup.pds),
  sig_PDC = grep("TRUE", results$deseq.sigup.pdc),
  CTCF_G4 = grep("and", results$class),
  CTCFonly = grep("not", results$class)
)
plot(euler(vl), quantities = T)

results$uid <- seq(1:nrow(results))
vl <- list(
  sig_PDS = grep("TRUE", results$deseq.sigup.pds),
  sig_PDC = grep("TRUE", results$deseq.sigup.pdc),
  CTCF_G4 = grep("and", results$class)
)
plot(euler(vl), quantities = T)

results$sig.by.class.pds <- paste0(results$deseq.sigup.pds, "_", results$class)
results$psize <- 0.01
results$psize[results$deseq.sigup.pds &
                results$class == "CTCF_and_G4"] <- 1

ggscatter(
  results,
  x = "log2.mock",
  y = "log2.pds",
  size = 0.5,
  alpha = results$psize,
  color = "sig.by.class.pds",
  palette = c(mypal[5], mypal[5], mypal[5], mypal[2], mypal[1])
)

ggscatter(
  results[!grepl("NA", results$sig.by.class.pds), ],
  x = "log2.mock",
  y = "deseq.lfc.pds",
  size = 0.2,
  alpha = "psize",
  color = "sig.by.class.pds",
  palette = c("#505050", "#505050", "red2", "blue")
)

ggscatterhist(
  results[!grepl("NA", results$sig.by.class.pds), ],
  x = "log2.mock",
  y = "log2.pds",
  size = 0.4,
  alpha = "psize",
  color = "sig.by.class.pds",
  margin.params = list(
    fill = "sig.by.class.pds",
    color = "black",
    size = 0.2
  ),
  palette = c("#505050", "#505050", "red2", "blue")
)
Warning: Removed 13 rows containing non-finite outside the scale range (`stat_density()`).
Warning: Removed 10 rows containing non-finite outside the scale range (`stat_density()`).

ggscatter(
  results,
  x = "raw.lfc.pds",
  y = "log2.G4",
  size = 0.2,
  alpha = 0.1,
  color = "deseq.sig.pds",
  cor.coef = T,
  palette = c("#888888",mypal[2])
)
Warning: Removed 16803 rows containing non-finite outside the scale range (`stat_cor()`).
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_point()`).

ggsave(glue("{plot_folder}Scatter_G4_vs_LFC_PDS.pdf"),
       plot = rasterize(last_plot()))
Saving 3 x 3 in image
Warning: Removed 16803 rows containing non-finite outside the scale range (`stat_cor()`).
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_point()`).
ggscatter(
  results,
  x = "raw.lfc.pdc",
  y = "log2.G4",
  size = 0.2,
  alpha = 0.1,
  color = "deseq.sig.pdc",
  cor.coef = T,
  palette = c("#888888",mypal[2])
)
Warning: Removed 16800 rows containing non-finite outside the scale range (`stat_cor()`).

ggsave(glue("{plot_folder}Scatter_G4_vs_LFC_PDC.pdf"),
       plot = rasterize(last_plot()))
Saving 3 x 3 in image
Warning: Removed 16800 rows containing non-finite outside the scale range (`stat_cor()`).
results$G4.quantile <- dplyr::ntile(results$G4_NT, n = 5)
results$pds.quantile <- dplyr::ntile(results$mean.pds-results$mean.mock, n = 4)
results$pdc.quantile <- dplyr::ntile(results$mean.pdc-results$mean.mock, n = 4)
tb <- table(results$pds.quantile, results$class)
tb
   
    CTCF_and_G4 CTCF_not_G4
  1         540       12364
  2         639       12264
  3         552       12351
  4         557       12346
ggbarplot(as.data.frame(t(as.matrix(tb))),"Var2","Freq",fill="Var1",palette = c(mypal[1],"#aaaaaa"),label = TRUE, lab.pos="in",ylim=c(11000,13000))

ggsave(glue("{plot_folder}Barplot_peakCat_by_PDSquantile.pdf"),
       plot = last_plot())
Saving 4 x 3 in image
ggviolin(
  results,
  x = "pds.quantile",
  y = "deseq.lfc.pds",
  fill = mypal[3],
  add = "median_iqr"
) + coord_cartesian(ylim = c(-1.5, 1.5)) + geom_hline(yintercept = 0, linetype =
                                                    "dotted")
Warning: Removed 3 rows containing non-finite outside the scale range (`stat_ydensity()`).
Warning: Removed 3 rows containing non-finite outside the scale range (`stat_summary()`).

ggsave(glue("{plot_folder}Violin_lfcPDS_by_PDSquantile.pdf"),
       plot = last_plot())
Saving 3 x 3 in image
Warning: Removed 3 rows containing non-finite outside the scale range (`stat_ydensity()`).
Warning: Removed 3 rows containing non-finite outside the scale range (`stat_summary()`).
top25.pds <- makeGRangesFromDataFrame(results[results$pds.quantile==4,1:3],na.rm=T)
bot75.pds <- makeGRangesFromDataFrame(results[results$pds.quantile!=4,1:3],na.rm=T)
p1 <- plot_bw_profile(bwfiles = geo_bigwigs,loci = top25.pds, mode = "center",verbose = F, colors = mypal) + coord_cartesian(ylim=c(0,6))
Warning in .summarize_matrix(fg, label) :
  Profile plot: 658 generated ( 0.0509958924281175 per locus)
Warning in .summarize_matrix(fg, label) :
  Profile plot: 658 generated ( 0.0509958924281175 per locus)
Warning in .summarize_matrix(fg, label) :
  Profile plot: 658 generated ( 0.0509958924281175 per locus)
p2 <- plot_bw_profile(bwfiles = geo_bigwigs,loci = bot75.pds, mode = "center",verbose = F, colors = mypal) + coord_cartesian(ylim=c(0,6))
Warning in .summarize_matrix(fg, label) :
  Profile plot: 2056 generated ( 0.0531128907259106 per locus)
Warning in .summarize_matrix(fg, label) :
  Profile plot: 2056 generated ( 0.0531128907259106 per locus)
Warning in .summarize_matrix(fg, label) :
  Profile plot: 2056 generated ( 0.0531128907259106 per locus)
ggarrange(p1,p2, ncol=2)

ggsave(glue("{plot_folder}Profile_CTCF_top25_PDSquantile.pdf"),
       plot = last_plot())
Saving 6 x 3 in image
top25.pds.CTCF_G4 <- makeGRangesFromDataFrame(results[(results$pds.quantile==4 & results$class == "CTCF_and_G4"),1:3],na.rm=T)
top25.pds.CTCF_only <- makeGRangesFromDataFrame(results[(results$pds.quantile==4  &  results$class == "CTCF_not_G4"),1:3],na.rm=T)

p1 <- plot_bw_profile(bwfiles = geo_bigwigs[1:2],loci = top25.pds.CTCF_G4, mode = "center",verbose = F, colors = mypal[c(1,2)]) + coord_cartesian(ylim=c(0,10))
p2 <- plot_bw_profile(bwfiles = geo_bigwigs[1:2],loci = top25.pds.CTCF_only, mode = "center",verbose = F, colors = mypal[c(1,2)]) + coord_cartesian(ylim=c(0,10))
Warning in .summarize_matrix(fg, label) :
  Profile plot: 639 generated ( 0.0517576543009882 per locus)
Warning in .summarize_matrix(fg, label) :
  Profile plot: 639 generated ( 0.0517576543009882 per locus)
ggarrange(p1,p2, ncol=2)

ggsave(glue("{plot_folder}Profile_CTCF_top25_PDSquantile_byG4overlap.pdf"),
       plot = last_plot())
Saving 6 x 3 in image
top25.pds.CTCF_G4 <- makeGRangesFromDataFrame(results[(results$pds.quantile==4 & results$class == "CTCF_and_G4"),1:3],na.rm=T)
top25.pds.CTCF_only <- makeGRangesFromDataFrame(results[(results$pds.quantile==4  &  results$class == "CTCF_not_G4"),1:3],na.rm=T)

p1 <- plot_bw_profile(bwfiles = geo_bigwigs[2],bg_bwfiles = geo_bigwigs[1], loci = top25.pds.CTCF_G4, mode = "center",verbose = F, colors = mypal[2],norm_mode = "log2fc",show_error = T) + coord_cartesian(ylim=c(-1,1))
Warning in plot_bw_profile(bwfiles = geo_bigwigs[2], bg_bwfiles = geo_bigwigs[1],  :
  Stderr estimate not available when normalizing by input
p2 <- plot_bw_profile(bwfiles = geo_bigwigs[2],bg_bwfiles = geo_bigwigs[1], loci = top25.pds.CTCF_only, mode = "center",verbose = F, colors = mypal[2],norm_mode = "log2fc",show_error = T) + coord_cartesian(ylim=c(-1,1))
Warning in .summarize_matrix(fg, label) :
  Profile plot: 639 generated ( 0.0517576543009882 per locus)
Warning in .summarize_matrix(bg, "bg") :
  Profile plot: 639 generated ( 0.0517576543009882 per locus)
Warning in plot_bw_profile(bwfiles = geo_bigwigs[2], bg_bwfiles = geo_bigwigs[1],  :
  Stderr estimate not available when normalizing by input
ggarrange(p1,p2, ncol=2)

ggsave(glue("{plot_folder}Profile_CTCFfoldchange_top25_PDSquantile_byG4.pdf"),
       plot = last_plot())
Saving 6 x 3 in image
top25.pds.CTCF_G4 <- makeGRangesFromDataFrame(results[(results$pds.quantile==4 & results$class == "CTCF_and_G4"),1:3],na.rm=T)
top25.pds.CTCF_only <- makeGRangesFromDataFrame(results[(results$pds.quantile==4  &  results$class == "CTCF_not_G4"),1:3],na.rm=T)

p1 <- plot_bw_profile(bwfiles = G4_bigwigs, loci = top25.pds.CTCF_G4, mode = "center",verbose = F, colors = mypal[3],norm_mode = "log2fc") + coord_cartesian(ylim=c(0,10))
p2 <- plot_bw_profile(bwfiles = G4_bigwigs, loci = top25.pds.CTCF_only, mode = "center",verbose = F, colors = mypal[3],norm_mode = "log2fc") + coord_cartesian(ylim=c(0,10))
ggarrange(p1,p2, ncol=2)

ggsave(glue("{plot_folder}Profile_G4_top25_PDSquantile.pdf"),
       plot = last_plot())
Saving 6 x 3 in image
ord.top25 <- order(rowMeans(as.data.frame(bw_heatmap(bwfiles = geo_bigwigs[1],loci = top25.pds, mode = "center"))))
p1 <- plot_bw_heatmap(bwfiles = geo_bigwigs[1],loci = top25.pds, mode = "center",verbose = F, zmax = 10, max_rows_allowed = 50, order_by = ord.top25)
Warning in .preprocess_heatmap_matrix(values[[1]], zmin, zmax, order_by,  :
  Large matrix: 12903. Downscaled to 50
p2 <- plot_bw_heatmap(bwfiles = geo_bigwigs[2],loci = top25.pds, mode = "center",verbose = F, zmax = 10, max_rows_allowed = 50, order_by = ord.top25)
Warning in .preprocess_heatmap_matrix(values[[1]], zmin, zmax, order_by,  :
  Large matrix: 12903. Downscaled to 50
ggarrange(p1,p2, ncol=2, common.legend = T)

ggsave(glue("{plot_folder}Heatmap_CTCF_top25_PDSquantile.pdf"),
       plot = last_plot())
Saving 3 x 3 in image
ord.bot75 <- order(rowMeans(as.data.frame(bw_heatmap(bwfiles = geo_bigwigs[1],loci = bot75.pds, mode = "center"))))
p1 <- plot_bw_heatmap(bwfiles = geo_bigwigs[1],loci = bot75.pds, mode = "center",verbose = F, zmax = 10, max_rows_allowed = 200, order_by = ord.bot75)
Warning in .preprocess_heatmap_matrix(values[[1]], zmin, zmax, order_by,  :
  Large matrix: 38710. Downscaled to 200
p2 <- plot_bw_heatmap(bwfiles = geo_bigwigs[2],loci = bot75.pds, mode = "center",verbose = F, zmax = 10, max_rows_allowed = 200, order_by = ord.bot75)
Warning in .preprocess_heatmap_matrix(values[[1]], zmin, zmax, order_by,  :
  Large matrix: 38710. Downscaled to 200
ggarrange(p1,p2, ncol=2, common.legend = T)

ggsave(glue("{plot_folder}Heatmap_CTCF_bot75_PDSquantile.pdf"),
       plot = last_plot())
Saving 3 x 4 in image
G4_CnR <- bw_loci(c("../mendeley/FigureS2/GSM6634325_E14_Mock_G4.bw","../mendeley/FigureS2/GSM6634326_E14_PDS_G4.bw"),loci = makeGRangesFromDataFrame(results,na.rm=T),labels = c("Mock","PDS"))
results$G4_CnR_delta <- G4_CnR$PDS-G4_CnR$Mock
results$G4_CnR_lfc <- log2(G4_CnR$PDS/G4_CnR$Mock)
results$G4_CnR_mock <- G4_CnR$Mock
ggviolin(
  results,
  x = "pds.quantile",
  y = "G4_CnR_delta",
  fill = mypal[2],
  add = "median_iqr"
) + coord_cartesian(ylim = c(-10, 15)) + geom_hline(yintercept = 0, linetype =
                                                    "dotted")

ggsave(glue("{plot_folder}Violin_G4_CnR_delta_by_PDSquantile.pdf"),
       plot = last_plot())
Saving 3 x 2 in image
compare_means(G4_CnR_delta ~ pds.quantile, results)
results$pds.quartile.top25 <- results$pds.quantile == 4
mean(na.omit(results$G4_CnR_delta))
[1] 2.193939
mean(results$G4_CnR_delta[results$pds.quartile.top25])
[1] 2.32057
mean(results$G4_CnR_delta[!results$pds.quartile.top25])
[1] 2.151729
compare_means(G4_CnR_delta ~ pds.quartile.top25, results)
export.bed(top25.pds,"../peaks/CTCF_peaks_top25_pds_up.bed")
export.bed(bot75.pds,"../peaks/CTCF_peaks_bot75_pds_up.bed")
ggviolin(
  results,
  x = "G4.quantile",
  y = "deseq.lfc.pds",
  fill = mypal[1],
  add = "median_iqr"
) + coord_cartesian(ylim = c(-2, 2)) + geom_hline(yintercept = 0, linetype =
                                                    "dotted")
ggsave(glue("{plot_folder}Violin_CTCF_lfc_by_G4quantile.pdf"),
       plot = last_plot())
ggviolin(
  results,
  x = "G4.quantile",
  y = "log2.G4",
  fill = mypal2[5],
  add = "median_iqr"
) + geom_hline(yintercept = 0, linetype = "dotted")
ggsave(glue("{plot_folder}Violin_G4_by_G4quantile.pdf"), plot = last_plot())
peak_cats <- bedscout::import_named_bed_into_list(peaks_bed)
plot_bw_profile(
  G4_bigwigs,
  peak_cats,
  labels = c(
    peak_cats[[1]][1, ]$name,
    peak_cats[[2]][1, ]$name,
    peak_cats[[3]][1, ]$name,
    peak_cats[[4]][1, ]$name
  ),
  mode = "center",
  show_error = T,
  verbose = F,
  remove_top = 0.001,
  colors = mypal
)
plot_bw_profile(
  combined_bigwigs[1],
  peak_cats,
  labels = c(
    peak_cats[[1]][1, ]$name,
    peak_cats[[2]][1, ]$name,
    peak_cats[[3]][1, ]$name,
    peak_cats[[4]][1, ]$name
  ),
  mode = "center",
  show_error = T,
  verbose = F,
  remove_top = 0.001,
  colors = mypal
)
p1 <- plot_bw_profile(
  c(mocks_bigwigs, pds_bigwigs),
  peak_cats[[1]],
  labels = c("mock1", "mock2", "trt1", "rtr2"),
  mode = "center",
  show_error = T,
  verbose = F,
  remove_top = 0.001,
  colors = mypal2[c(9, 10, 5, 6)],
  upstream = 1500,
  downstream = 1500
)
p2 <- plot_bw_profile(
  c(mocks_bigwigs, pds_bigwigs),
  peak_cats[[2]],
  labels = c("mock1", "mock2", "trt1", "rtr2"),
  mode = "center",
  show_error = T,
  verbose = F,
  remove_top = 0.001,
  colors = mypal2[c(9, 10, 5, 6)],
  upstream = 1500,
  downstream = 1500
)
p3 <- plot_bw_profile(
  c(mocks_bigwigs, pds_bigwigs),
  peak_cats[[3]],
  labels = c("mock1", "mock2", "trt1", "rtr2"),
  mode = "center",
  show_error = T,
  verbose = F,
  remove_top = 0.001,
  colors = mypal2[c(9, 10, 5, 6)],
  upstream = 1500,
  downstream = 1500
)
p4 <- plot_bw_profile(
  c(mocks_bigwigs, pds_bigwigs),
  peak_cats[[4]],
  labels = c("mock1", "mock2", "trt1", "rtr2"),
  mode = "center",
  show_error = T,
  verbose = F,
  remove_top = 0.001,
  colors = mypal2[c(9, 10, 5, 6)],
  upstream = 1500,
  downstream = 1500
)

ggarrange(p1, p2, p3, p4, ncol = 4, nrow = 1)
p1 <- plot_bw_profile(
  c(mocks_bigwigs, pds_bigwigs),
  peak_cats[[1]],
  labels = c("mock1", "mock2", "trt1", "rtr2"),
  mode = "center",
  show_error = T,
  verbose = F,
  remove_top = 0.001,
  colors = mypal2[c(9, 10, 3, 4)],
  upstream = 1500,
  downstream = 1500
)
p2 <- plot_bw_profile(
  c(mocks_bigwigs, pds_bigwigs),
  peak_cats[[2]],
  labels = c("mock1", "mock2", "trt1", "rtr2"),
  mode = "center",
  show_error = T,
  verbose = F,
  remove_top = 0.001,
  colors = mypal2[c(9, 10, 3, 4)],
  upstream = 1500,
  downstream = 1500
)
p3 <- plot_bw_profile(
  c(mocks_bigwigs, pds_bigwigs),
  peak_cats[[3]],
  labels = c("mock1", "mock2", "trt1", "rtr2"),
  mode = "center",
  show_error = T,
  verbose = F,
  remove_top = 0.001,
  colors = mypal2[c(9, 10, 3, 4)],
  upstream = 1500,
  downstream = 1500
)
p4 <- plot_bw_profile(
  c(mocks_bigwigs, pds_bigwigs),
  peak_cats[[4]],
  labels = c("mock1", "mock2", "trt1", "rtr2"),
  mode = "center",
  show_error = T,
  verbose = F,
  remove_top = 0.001,
  colors = mypal2[c(9, 10, 3, 4)],
  upstream = 1500,
  downstream = 1500
)

ggarrange(p1, p2, p3, p4, ncol = 4, nrow = 1)

cen we learn something from the genes with top-most increase in CTCF?

sigup.pds <- results[results$deseq.sigup.pds, ]
sigup.pdc <- results[results$deseq.sigup.pdc, ]

#export.bed(sigup.pds,"peaks_sigup_pds.bed")
#export.bed(sigup.pds,"peaks_sigup_pds.bed")
gghistogram(results, "width", add = "median", fill = "class")

Distance-based analysis

G4_bed <- import('../data/G4_CnT/G4_WT_peaks.narrowPeak')
ATAC_bed <- import('../data/ATAC-seq/ATAC_seq_mESC_Martire_peaks.narrowPeak')
pro_bed <- import('../data/regions/promoters_geneSymbol.mm10.bed')

G4_bed <- bedscout::annotate_overlapping_features(G4_bed, pro_bed, name_field = "name")

G4_bed$name <- "G4"
G4_bed$name[!is.na(G4_bed$nearby_features)] <- "G4pro"
G4_bed$name[grepl("pro", G4_bed$name)] <- "G4pro"
nearest_G4_1kb <- bedscout::annotate_nearby_features(
  ctcf,
  G4_bed,
  distance_cutoff = 1000,
  ignore.strand = T,
  name_field = "name"
)
nearest_G4_2.5kb <- bedscout::annotate_nearby_features(
  ctcf,
  G4_bed,
  distance_cutoff = 2500,
  ignore.strand = T,
  name_field = "name"
)
nearest_G4_5kb <- bedscout::annotate_nearby_features(
  ctcf,
  G4_bed,
  distance_cutoff = 5000,
  ignore.strand = T,
  name_field = "name"
)
nearest_G4_10kb <- bedscout::annotate_nearby_features(
  ctcf,
  G4_bed,
  distance_cutoff = 10000,
  ignore.strand = T,
  name_field = "name"
)
nearest_G4_50kb <- bedscout::annotate_nearby_features(
  ctcf,
  G4_bed,
  distance_cutoff = 50000,
  ignore.strand = T,
  name_field = "name"
)

ctcf$nearest_G4 <- factor(">50kb",
                          levels = c("<1kb", "<2.5kb", "<5kb", "<10kb", "<50kb", ">50kb"))
ctcf$nearest_G4[!is.na(nearest_G4_50kb$nearby_features)] <- "<50kb"
ctcf$nearest_G4[!is.na(nearest_G4_10kb$nearby_features)] <- "<10kb"
ctcf$nearest_G4[!is.na(nearest_G4_5kb$nearby_features)] <- "<5kb"
ctcf$nearest_G4[!is.na(nearest_G4_2.5kb$nearby_features)] <- "<2.5kb"
ctcf$nearest_G4[!is.na(nearest_G4_1kb$nearby_features)] <- "<1kb"

ctcf$nearest_G4_type <- "none"
ctcf$nearest_G4_type[!is.na(nearest_G4_50kb$nearby_features)] <- nearest_G4_50kb$nearby_features[!is.na(nearest_G4_50kb$nearby_features)]
ctcf$nearest_G4_type[!is.na(nearest_G4_10kb$nearby_features)] <- nearest_G4_10kb$nearby_features[!is.na(nearest_G4_10kb$nearby_features)]
ctcf$nearest_G4_type[!is.na(nearest_G4_5kb$nearby_features)] <- nearest_G4_5kb$nearby_features[!is.na(nearest_G4_5kb$nearby_features)]
ctcf$nearest_G4_type[!is.na(nearest_G4_2.5kb$nearby_features)] <- nearest_G4_2.5kb$nearby_features[!is.na(nearest_G4_2.5kb$nearby_features)]
ctcf$nearest_G4_type[!is.na(nearest_G4_1kb$nearby_features)] <- nearest_G4_1kb$nearby_features[!is.na(nearest_G4_1kb$nearby_features)]

ctcf$nearest_G4_type[grep("pro", ctcf$nearest_G4_type)] <- "G4pro"

results$nearest_G4 <- ctcf$nearest_G4
results$nearest_G4_type <- ctcf$nearest_G4_type
table(results$nearest_G4)
table(results$nearest_G4_type)
table(results$nearest_G4, results$nearest_G4_type)
ggviolin(
  results,
  x = "nearest_G4",
  y = "mean.mock",
  fill = mypal[1],
  add = "median_iqr"
) + coord_cartesian(ylim = c(0, 10)) +
  stat_compare_means(label.y = 8,
                     label.x = 3,
                     size = 3) +
  geom_hline(yintercept = 0, linetype = "dotted")
ggsave(glue("{plot_folder}Violin_CTCF_signal_by_G4distance.pdf"),
       plot = last_plot())
ggviolin(
  results,
  x = "nearest_G4",
  y = "deseq.lfc.pds",
  fill = mypal[1],
  add = "median_iqr"
) + coord_cartesian(ylim = c(-4, 4)) + geom_hline(yintercept = 0, linewidth = 0.2) +
  geom_hline(
    yintercept = median(results$deseq.lfc.pds[results$nearest_G4 == "<1kb"]),
    linetype = "dotted",
    linewidth = 0.2
  ) +
  stat_compare_means(label.y = 3,
                     label.x = 2,
                     size = 3)
ggsave(glue("{plot_folder}Violin_CTCF_PDS_lfc_by_G4distance.pdf"),
       plot = last_plot())
nearest_G4_stats = compare_means(deseq.lfc.pds ~ nearest_G4, results)
nearest_G4_stats
write_tsv(nearest_G4_stats,
          glue("{stat_output}pds_G4distance_plots-statistics.tsv"))
ggviolin(
  results,
  x = "nearest_G4",
  y = "deseq.lfc.pdc",
  fill = mypal[1],
  add = "median_iqr"
) + coord_cartesian(ylim = c(-2, 2)) + geom_hline(yintercept = 0, linewidth = 0.2) + geom_hline(
  yintercept = mean(results$deseq.lfc.pdc[results$nearest_G4 == "<1kb"]),
  linetype = "dotted",
  linewidth = 0.2
)
ggsave(glue("{plot_folder}Violin_CTCF_PhenDC3_lfc_by_G4distance.pdf"), plot = last_plot())
nearest_G4_stats = compare_means(deseq.lfc.pdc ~ nearest_G4, results)
nearest_G4_stats
write_tsv(nearest_G4_stats, glue("{stat_output}phendc_G4distance_plots-statistics.tsv"))
ggviolin(
  results,
  x = "nearest_G4",
  y = "deseq.lfc.pds",
  fill = "nearest_G4_type",
  palette = mypal[c(1, 3, 5)],
  add = "median_iqr"
) + coord_cartesian(ylim = c(-2, 4)) + geom_hline(yintercept = 0, linewidth = 0.2) + geom_hline(
  yintercept = mean(results$deseq.lfc.pds[results$nearest_G4 == "<1kb"]),
  linetype = "dotted",
  linewidth = 0.2
) +
  stat_compare_means(aes(group = nearest_G4_type))
ggsave(glue("{plot_folder}Violin_CTCF_PDS_lfc_by_G4distance_pro.pdf"),
       plot = last_plot())
compare_means(deseq.lfc.pds ~ nearest_G4, results[results$nearest_G4_type ==
                                                    "G4", ])
compare_means(deseq.lfc.pds ~ nearest_G4, results[results$nearest_G4_type ==
                                                    "G4", ]) %>%
  write_tsv(.,
            glue(
              "{stat_output}pds_G4distance_plots-nearest_G4-statistics.tsv"
            ))
compare_means(deseq.lfc.pds ~ nearest_G4, results[results$nearest_G4_type ==
                                                    "G4pro", ])
compare_means(deseq.lfc.pds ~ nearest_G4, results[results$nearest_G4_type ==
                                                    "G4pro", ]) %>%
  write_tsv(.,
            glue(
              "{stat_output}pds_G4distance_plots-nearest_G4pro-statistics.tsv"
            ))
ggviolin(
  results,
  x = "nearest_G4",
  y = "deseq.lfc.pdc",
  fill = "nearest_G4_type",
  palette = mypal[c(1, 3, 5)],
  add = "median_iqr"
) + coord_cartesian(ylim = c(-2, 2)) + geom_hline(yintercept = 0, linewidth = 0.2) + geom_hline(
  yintercept = mean(results$deseq.lfc.pdc[results$nearest_G4 == "<1kb"]),
  linetype = "dotted",
  linewidth = 0.2
)
ggsave(glue(
  "{plot_folder}Violin_CTCF_PhenDC3_lfc_by_G4distance_pro.pdf"
),
plot = last_plot())
compare_means(deseq.lfc.pdc ~ nearest_G4, results[results$nearest_G4_type ==
                                                    "G4", ])
compare_means(deseq.lfc.pdc ~ nearest_G4, results[results$nearest_G4_type ==
                                                    "G4", ]) %>%
  write_tsv(.,
            glue(
              "{stat_output}phendc_G4distance_plots-nearest_G4-statistics.tsv"
            ))
compare_means(deseq.lfc.pdc ~ nearest_G4, results[results$nearest_G4_type ==
                                                    "G4pro", ])
compare_means(deseq.lfc.pdc ~ nearest_G4, results[results$nearest_G4_type ==
                                                    "G4pro", ]) %>%
  write_tsv(.,
            glue(
              "{stat_output}phendc_G4distance_plots-nearest_G4pro-statistics.tsv"
            ))

Distance-based analysis, this time with predicted G4 motifs, not experimental peaks

#results <- read.table(glue("{result_folder}foldchange_results.txt"))
results$class <- factor(results$class, levels = c("CTCF_and_G4", "CTCF_not_G4"))
G4_bed <- import('../data/G4_CnT/G4_WT_peaks.narrowPeak')
G4_bed$name <- "peak"
G4pred_bed <- import('../data/predG4/mm10_canonical_G4_PQS-regex.bed')
ATAC_bed <- import('../data/ATAC-seq/ATAC_seq_mESC_Martire_peaks.narrowPeak')
pro_bed <- import('../data/regions/promoters_geneSymbol.mm10.bed')

G4pred_bed <- bedscout::annotate_overlapping_features(G4pred_bed, G4_bed, name_field = "name")

G4pred_bed$name <- "G4pred"
G4pred_bed$name[grepl("peak", G4pred_bed$nearby_features)] <- "G4exp"

table(G4pred_bed$name)
nearest_G4_0kb <- bedscout::annotate_nearby_features(
  ctcf,
  G4pred_bed,
  distance_cutoff = 0,
  ignore.strand = T,
  name_field = "name"
)
nearest_G4_1kb <- bedscout::annotate_nearby_features(
  ctcf,
  G4pred_bed,
  distance_cutoff = 1000,
  ignore.strand = T,
  name_field = "name"
)
nearest_G4_2.5kb <- bedscout::annotate_nearby_features(
  ctcf,
  G4pred_bed,
  distance_cutoff = 2500,
  ignore.strand = T,
  name_field = "name"
)
nearest_G4_5kb <- bedscout::annotate_nearby_features(
  ctcf,
  G4pred_bed,
  distance_cutoff = 5000,
  ignore.strand = T,
  name_field = "name"
)
nearest_G4_10kb <- bedscout::annotate_nearby_features(
  ctcf,
  G4pred_bed,
  distance_cutoff = 10000,
  ignore.strand = T,
  name_field = "name"
)


ctcf$nearest_G4 <- factor(">10kb",
                          levels = c("0kb", "<1kb", "<2.5kb", "<5kb", "<10kb", ">10kb"))
ctcf$nearest_G4[!is.na(nearest_G4_10kb$nearby_features)] <- "<10kb"
ctcf$nearest_G4[!is.na(nearest_G4_5kb$nearby_features)] <- "<5kb"
ctcf$nearest_G4[!is.na(nearest_G4_2.5kb$nearby_features)] <- "<2.5kb"
ctcf$nearest_G4[!is.na(nearest_G4_1kb$nearby_features)] <- "<1kb"
ctcf$nearest_G4[!is.na(nearest_G4_0kb$nearby_features)] <- "0kb"

results$nearest_G4 <- ctcf$nearest_G4
table(results$nearest_G4)
table(results$nearest_G4, results$pro)
mdf <- reshape2::melt(table(results$nearest_G4, results$pro))
ggplot(mdf, aes(Var1, Var2, fill = value)) +
  geom_tile(show.legend = F) + geom_text(aes(label = value)) +
  scale_fill_gradient(low = "white", high = "orange") + theme_minimal()
ggsave(glue("{plot_folder}Heatmap_CTCF_sites_by_predG4distance.pdf"),
       plot = last_plot())
ggviolin(
  results,
  x = "nearest_G4",
  y = "mean.mock",
  fill = mypal[1],
  add = "median_iqr"
) + coord_cartesian(ylim = c(0, 10)) + geom_hline(yintercept = 0, linetype =
                                                    "dotted") +
  stat_compare_means(
    aes(group = nearest_G4),
    label.y = 8,
    label.x = 2,
    size = 3
  )
ggsave(glue(
  "{plot_folder}Violin_CTCF_signal_by_predG4distance.pdf"),
  plot = last_plot())
ggviolin(
  results,
  x = "nearest_G4",
  y = "deseq.lfc.pds",
  fill = mypal[1],
  add = "median_iqr"
) + coord_cartesian(ylim = c(-0.5, 1.5)) + geom_hline(yintercept = 0, linewidth = 0.2) + geom_hline(
  yintercept = median(results$deseq.lfc.pds[results$nearest_G4 == "0kb"]),
  linetype = "dotted",
  linewidth = 0.2
) +
  stat_compare_means(label.y = 1.3,
                     label.x = 3,
                     size = 2)
ggsave(glue("{plot_folder}Violin_CTCF_PDS_lfc_by_predG4distance.pdf"),
       plot = last_plot())
compare_means(deseq.lfc.pds ~ nearest_G4, results)
ggviolin(
  results,
  x = "nearest_G4",
  y = "deseq.lfc.pdc",
  fill = mypal[1],
  add = "median_iqr"
) + coord_cartesian(ylim = c(-0.5, 1.5)) + geom_hline(yintercept = 0, linewidth = 0.2) + geom_hline(
  yintercept = mean(results$deseq.lfc.pdc[results$nearest_G4 == "0kb"]),
  linetype = "dotted",
  linewidth = 0.2
) +
  stat_compare_means(label.y = 1.3,
                     label.x = 3,
                     size = 2)
ggsave(glue(
  "{plot_folder}Violin_CTCF_PhenDC3_lfc_by_predG4distance.pdf"
),
plot = last_plot())
compare_means(deseq.lfc.pdc ~ nearest_G4, results)
ggviolin(
  results,
  x = "nearest_G4",
  y = "deseq.lfc.pds",
  fill = "pro",
  palette = mypal[c(1, 3, 5)],
  add = "median_iqr"
) + coord_cartesian(ylim = c(-0.5, 1.5)) + geom_hline(yintercept = 0, linewidth = 0.2) + geom_hline(
  yintercept = mean(results$deseq.lfc.pds[results$nearest_G4 == "0kb"]),
  linetype = "dotted",
  linewidth = 0.2
) +
  stat_compare_means(aes(group = pro),
                     label.y = 1.3,
                     label.x = 3,
                     size = 1.5)
ggsave(glue("{plot_folder}Violin_CTCF_PDS_lfc_by_predG4distance_pro.pdf"), plot = last_plot())

```{ r } compare_means(deseq.lfc.pds ~ nearest_G4, results[results\(pro == "noPro", ]) compare_means(deseq.lfc.pds ~ nearest_G4, results[results\)pro == “noPro”, ]) %>% write_tsv(., “pds_G4distance_plots-noPro-statistics.tsv”)


<!-- rnb-text-end -->


<!-- rnb-chunk-begin -->


<!-- rnb-source-begin eyJkYXRhIjoiYGBgclxuY29tcGFyZV9tZWFucyhkZXNlcS5sZmMucGRzIH4gbmVhcmVzdF9HNCwgcmVzdWx0c1tyZXN1bHRzJHBybyA9PSBcIlByb1wiLCBdKSAlPiVcbiAgd3JpdGVfdHN2KC4sXG4gICAgICAgICAgICBnbHVlKFwie3N0YXRfb3V0cHV0fXBkc19HNGRpc3RhbmNlX3Bsb3RzLVByby1zdGF0aXN0aWNzLnRzdlwiKSlcblxuY29tcGFyZV9tZWFucyhkZXNlcS5sZmMucGRzIH4gcHJvLCBncm91cC5ieSA9IFwibmVhcmVzdF9HNFwiLCByZXN1bHRzKSAlPiVcbiAgd3JpdGVfdHN2KC4sXG4gICAgICAgICAgICBnbHVlKFxuICAgICAgICAgICAgICBcIntzdGF0X291dHB1dH1wZHNfRzRkaXN0YW5jZV9wbG90cy1iZXR3ZWVuX3Byb21fdHlwZXNfc3RhdHMudHN2XCJcbiAgICAgICAgICAgICkpXG5cbmNvbXBhcmVfbWVhbnMoZGVzZXEubGZjLnBkcyB+IG5lYXJlc3RfRzQsIGdyb3VwLmJ5ID0gXCJwcm9cIiwgcmVzdWx0cykgJT4lXG4gIHdyaXRlX3RzdiguLFxuICAgICAgICAgICAgZ2x1ZShcbiAgICAgICAgICAgICAgXCJ7c3RhdF9vdXRwdXR9cGRzX0c0ZGlzdGFuY2VfcGxvdHMtd2l0aGluX3Byb21fdHlwZXNfc3RhdHMudHN2XCJcbiAgICAgICAgICAgICkpXG5cbmBgYCJ9 -->

```r
compare_means(deseq.lfc.pds ~ nearest_G4, results[results$pro == "Pro", ]) %>%
  write_tsv(.,
            glue("{stat_output}pds_G4distance_plots-Pro-statistics.tsv"))

compare_means(deseq.lfc.pds ~ pro, group.by = "nearest_G4", results) %>%
  write_tsv(.,
            glue(
              "{stat_output}pds_G4distance_plots-between_prom_types_stats.tsv"
            ))

compare_means(deseq.lfc.pds ~ nearest_G4, group.by = "pro", results) %>%
  write_tsv(.,
            glue(
              "{stat_output}pds_G4distance_plots-within_prom_types_stats.tsv"
            ))
ggviolin(
  results,
  x = "nearest_G4",
  y = "deseq.lfc.pdc",
  fill = "pro",
  palette = mypal[c(1, 3, 5)],
  add = "median_iqr"
) + coord_cartesian(ylim = c(-0.5, 1.5)) + geom_hline(yintercept = 0, linewidth = 0.2) + geom_hline(
  yintercept = mean(results$deseq.lfc.pdc[results$nearest_G4 == "0kb"]),
  linetype = "dotted",
  linewidth = 0.2
) +
  stat_compare_means(aes(group = pro),
                     label.y = 1.3,
                     label.x = 3,
                     size = 1.5)
ggsave(glue(
  "{plot_folder}Violin_CTCF_PhenDC3_lfc_by_predG4distance_pro.pdf"
),
plot = last_plot())
compare_means(deseq.lfc.pdc ~ nearest_G4, results[results$pro == "noPro", ]) %>%
  write_tsv(.,
            glue(
              "{stat_output}phendc_G4distance_plots-noPro-statistics.tsv"
            ))

compare_means(deseq.lfc.pdc ~ pro, group.by = "nearest_G4", results) %>%
  write_tsv(.,
            glue(
              "{stat_output}phendc_G4distance_plots-between_prom_types_stats.tsv"
            ))

compare_means(deseq.lfc.pdc ~ nearest_G4, group.by = "pro", results) %>%
  write_tsv(.,
            glue(
              "{stat_output}phendc_G4distance_plots-within_prom_types_stats.tsv"
            ))
---
title: "R Notebook"
output: html_notebook
editor_options: 
  chunk_output_type: inline
---


```{r}
library("tidyverse")
library("data.table")
library("rtracklayer")
library("ggrastr")
library("glue")
library("DESeq2")
library("ggpubr")
library("wigglescout")
library("eulerr")
library("ggplot2")
library(corrplot)

# export 
result_folder = "../results/"
plot_folder = "../results/plots/"
stat_output = "../results/stats/"

combined_bigwigs <- list.files("../mendeley/Figure2","CTCF_ChIP-seq.+bw",
                  full.names = TRUE)
geo_bigwigs <- list.files("../mendeley/Figure2","GSM.+bw",
                  full.names = TRUE)

```

```{r}
# colors
mypal <-c("cornflowerblue","orange","red2","darkgreen","#505050")
mypal <-c("#00619D","#A82A34","orange","seagreen","#505050")
mypal2 <-c("#00619D","#7FB0CE","#A82A34","#D39499","orange","#FFD55A","seagreen","#7DBA9C","#505050")
```

```{r}
# diff signal function
bw_granges_diff_analysis <- function(granges_c1,
                                     granges_c2,
                                     label_c1,
                                     label_c2,
                                     estimate_size_factors = FALSE,
                                     as_granges = FALSE) {
  # Bind first, get numbers after
  names_values <- NULL
  fields <- names(mcols(granges_c1))
  
  if ("name" %in% fields) {
    names_values <- mcols(granges_c1)[["name"]]
    granges_c1 <- granges_c1[, fields[fields != "name"]]
  }
  
  fields <- names(mcols(granges_c2))
  if ("name" %in% fields) {
    granges_c2 <- granges_c2[, fields[fields != "name"]]
  }
  
  cts_df <- cbind(data.frame(granges_c1), mcols(granges_c2))
  
  if (!is.null(names_values)) {
    rownames(cts_df) <- names_values
  }
  
  # Needs to drop non-complete cases and match rows
  complete <- complete.cases(cts_df)
  cts_df <- cts_df[complete, ]
  
  values_df <- cts_df[, 6:ncol(cts_df)] %>% dplyr::select(where(is.numeric))
  cts <- get_nreads_columns(values_df)
  
  condition_labels <- c(rep(label_c1, length(mcols(granges_c1))), rep(label_c2, length(mcols(granges_c2))))
  
  
  coldata <- data.frame(colnames(cts), condition = as.factor(condition_labels))
  print(coldata)
  
  dds <- DESeq2::DESeqDataSetFromMatrix(
    countData = cts,
    colData = coldata,
    design = ~ condition,
    rowRanges = granges_c1[complete, ]
  )
  
  
  if (estimate_size_factors == TRUE) {
    dds <- DESeq2::estimateSizeFactors(dds)
  }
  else {
    # Since files are scaled, we do not want to estimate size factors
    sizeFactors(dds) <- c(rep(1, ncol(cts)))
  }
  
  dds <- DESeq2::estimateDispersions(dds)
  dds <- DESeq2::nbinomWaldTest(dds)
  
  if (as_granges) {
    result <- DESeq2::results(dds, format = "GRanges", alpha = 0.01)
    if (!is.null(names_values)) {
      result$name <- names_values[complete]
    }
    
  }
  else {
    # result <- results(dds, format="DataFrame")
    result <- dds
  }
  
  result
}

get_nreads_columns <- function(df, length_factor = 100) {
  # Convert mean coverages to round integer read numbers
  cts <- as.matrix(df)
  cts <- as.matrix(cts[complete.cases(cts), ])
  cts <- round(cts * length_factor)
  cts
}
```



```{r}
ctcf.and.G4.pro <- import("../peaks/G4 CTCF_with_promoters_sorted.bed")
ctcf.not.G4.pro <- import("../peaks/CTCF-only_with_promoters_sorted.bed")
ctcf.and.G4.npr <- import("../peaks/G4 CTCF_without_promoters_sorted.bed")
ctcf.not.G4.npr <- import("../peaks/CTCF-only_without_promoters_sorted.bed")

ctcf.and.G4.pro$class <- "CTCF_and_G4"
ctcf.not.G4.pro$class <- "CTCF_not_G4"
ctcf.and.G4.npr$class <- "CTCF_and_G4"
ctcf.not.G4.npr$class <- "CTCF_not_G4"
ctcf.and.G4.pro$pro <- "Pro"
ctcf.not.G4.pro$pro <- "Pro"
ctcf.and.G4.npr$pro <- "noPro"
ctcf.not.G4.npr$pro <- "noPro"

ctcf.and.G4 <- c(ctcf.and.G4.pro,ctcf.and.G4.npr)
ctcf.not.G4 <- c(ctcf.not.G4.pro,ctcf.not.G4.npr)
ctcf <- c(ctcf.and.G4,ctcf.not.G4)
ctcf <- sortSeqlevels(ctcf)
ctcf <- sort(ctcf)
names(ctcf) <- paste0(ctcf$class," ",ctcf$pro)
peaks_bed <- "../peaks/CTCF_G4_in_6_categories.bed"
export.bed(ctcf, peaks_bed)
```

### check MACS peak widths
```{r fig.width=3, fig.height=3}
ctcf.raw <- import("../peaks/CTCF_mES_peaks.narrowPeak")
G4.raw <- import("../peaks/G4_WT_peaks.narrowPeak")
ctcf.raw$class <- "CTCF"
G4.raw$class <- "G4"
ctcf.raw$pro <- "NA"
G4.raw$pro <- "NA"
df <- rbind(as.data.frame(ctcf.raw)[c(1,2,3,4,5,12,13)], as.data.frame(G4.raw)[c(1,2,3,4,5,12,13)],as.data.frame(ctcf,row.names = NULL))
ggdensity(df,x="width",y = "density",color = "class", fill="class", alpha=0.1, palette=mypal) + scale_x_continuous(limits = c(100,600))
```

```{r}
min(width(G4.raw))
min(width(ctcf.raw))
median(width(G4.raw))
median(width(ctcf.raw))
```
### calculate coverage over peaks
```{r}
# Wulfridge CTCF ChIP-Seq mocks
mocks_bigwigs = c("../data/CutNTag_ChIP-Seq/bw/SRR23958387_GSM7116277_E14_Mock_CTCF_Rep1_Mus_musculus_ChIP-Seq.CPMnorm.bw",
          "../data/CutNTag_ChIP-Seq/bw/SRR23958386_GSM7116278_E14_Mock_CTCF_Rep2_Mus_musculus_ChIP-Seq.CPMnorm.bw")

# PDS
# Wulfridge CTCF ChIP-Seq
pds_bigwigs = c("../data/CutNTag_ChIP-Seq/bw/SRR23958385_GSM7116279_E14_PDS_CTCF_Rep1_Mus_musculus_ChIP-Seq.CPMnorm.bw",
                "../data/CutNTag_ChIP-Seq/bw/SRR23958384_GSM7116280_E14_PDS_CTCF_Rep2_Mus_musculus_ChIP-Seq.CPMnorm.bw")

pdc_bigwigs = c("../data/CutNTag_ChIP-Seq/bw/SRR23958383_GSM7116281_E14_PhenDC3_CTCF_Rep1_Mus_musculus_ChIP-Seq.CPMnorm.bw",
                "../data/CutNTag_ChIP-Seq/bw/SRR23958382_GSM7116282_E14_PhenDC3_CTCF_Rep2_Mus_musculus_ChIP-Seq.CPMnorm.bw")

G4_bigwigs = c("../mendeley/Figure1/G4_CUT-Tag_thisStudy.bw")

cov.mocks <- bw_loci(mocks_bigwigs,peaks_bed,labels=c("mock_1","mock_2"))
cov.pds <- bw_loci(pds_bigwigs,peaks_bed,labels=c("PDS_1","PDS_2"))
cov.pdc <- bw_loci(pdc_bigwigs,peaks_bed,labels=c("PhenDC3_1","PhenDC3_2"))
cov.G4 <- bw_loci(G4_bigwigs,peaks_bed,labels=c("G4_NT"))
cov.combined <- bw_loci(combined_bigwigs,peaks_bed,labels=c("mock","PDS","PhenDC3"))
cov.geo <- bw_loci(geo_bigwigs,peaks_bed,labels=c("GEO_mock","GEO_PDS","GEO_PhenDC3"))
```

### quick comparison on deposited and reprocessed bigwig files
```{r fig.width=10, fig.height=10}
corr_df <- data.frame(data.frame(cov.combined)[,c(9,6,7,8)],data.frame(cov.geo)[,6:8],data.frame(cov.mocks)[,6:7],data.frame(cov.pds)[,6:7],data.frame(cov.pdc)[,6:7])
res <- cor(corr_df[,2:13])
corrplot(res, type = "upper", order = "hclust",tl.col = "black", tl.srt = 45,addCoef.col = 'black',tl.pos = 'd',    cl.pos = 'n', col.lim=c(0.93, 1),is.corr = F, method = 'color')
```
```{r fig.height=3, fig.width=9}
p1 <- plot_bw_profile(c(mocks_bigwigs[1],pds_bigwigs[1],pdc_bigwigs[1]),loci = "../peaks/G4 CTCF_with_promoters_sorted.bed",mode = "center",verbose = F) + coord_cartesian(ylim=c(0,2.4))
p2 <- plot_bw_profile(c(mocks_bigwigs[2],pds_bigwigs[2],pdc_bigwigs[2]),loci = "../peaks/G4 CTCF_with_promoters_sorted.bed",mode = "center",verbose = F) + coord_cartesian(ylim=c(0,2.4))
p3 <- plot_bw_profile(bwfiles = combined_bigwigs,loci = "../peaks/G4 CTCF_with_promoters_sorted.bed",mode = "center",verbose = F) + coord_cartesian(ylim=c(0,2.4))
ggarrange(p1,p2,p3, ncol=3)
```

```{r fig.height=3, fig.width=6}
p1 <- plot_bw_profile(bwfiles = geo_bigwigs,loci = "../peaks/G4 CTCF_with_promoters_sorted.bed",mode = "center",verbose = F) + coord_cartesian(ylim=c(0,5))
p2 <- plot_bw_profile(bwfiles = combined_bigwigs,loci = "../peaks/G4 CTCF_with_promoters_sorted.bed",mode = "center",verbose = F) + coord_cartesian(ylim=c(0,2.6))
ggarrange(p1,p2, ncol=2)
```
```{r fig.height=3, fig.width=6}
p1 <- plot_bw_profile(bwfiles = geo_bigwigs,loci = "../peaks/CTCF-only_with_promoters_sorted.bed",mode = "center",verbose = F) + coord_cartesian(ylim=c(0,5))
p2 <- plot_bw_profile(bwfiles = combined_bigwigs,loci = "../peaks/CTCF-only_with_promoters_sorted.bed",mode = "center",verbose = F) + coord_cartesian(ylim=c(0,2.6))
ggarrange(p1,p2, ncol=2)
```
```{r fig.height=3, fig.width=9}
p1 <- plot_bw_loci_scatter(mocks_bigwigs[1],mocks_bigwigs[2], loci = "../peaks/G4 CTCF_with_promoters_sorted.bed", verbose = F)
p2 <- plot_bw_loci_scatter(pds_bigwigs[1],pds_bigwigs[2], loci = "../peaks/G4 CTCF_with_promoters_sorted.bed", verbose = F)
p3 <- plot_bw_loci_scatter(pdc_bigwigs[1],pdc_bigwigs[2], loci = "../peaks/G4 CTCF_with_promoters_sorted.bed", verbose = F)
ggarrange(p1,p2,p3, ncol=3)
```

```{r fig.height=3, fig.width=9}
p1 <- plot_bw_loci_scatter(combined_bigwigs[1],combined_bigwigs[2], loci = "../peaks/G4 CTCF_with_promoters_sorted.bed", verbose = F)
p2 <- plot_bw_loci_scatter(combined_bigwigs[2],combined_bigwigs[3], loci = "../peaks/G4 CTCF_with_promoters_sorted.bed", verbose = F)
p3 <- plot_bw_loci_scatter(combined_bigwigs[1],combined_bigwigs[3], loci = "../peaks/G4 CTCF_with_promoters_sorted.bed", verbose = F)
ggarrange(p1,p2,p3, ncol=3)
```


```{r}
results <- data.frame(
  as.data.frame(cov.mocks),
  as.data.frame(cov.pds)[6:7],
  as.data.frame(cov.pdc)[6:7],
  as.data.frame(cov.G4)[6],
  raw.lfc.pds_1 = log2(cov.pds$PDS_1 / cov.mocks$mock_1),
  raw.lfc.pds_2 = log2(cov.pds$PDS_2 / cov.mocks$mock_2),
  raw.lfc.pdc_1 = log2(cov.pds$PDS_1 / cov.mocks$mock_1),
  raw.lfc.pdc_2 = log2(cov.pdc$PhenDC3_2 / cov.mocks$mock_2),
  raw.lfc.pds = log2(rowMeans(as.data.frame(cov.pds)[6:7]) / rowMeans(as.data.frame(cov.mocks)[6:7])),
  raw.lfc.pdc = log2(rowMeans(as.data.frame(cov.pdc)[6:7]) / rowMeans(as.data.frame(cov.mocks)[6:7])),
  mean.mock = rowMeans(as.data.frame(cov.mocks)[6:7]),
  mean.pds = rowMeans(as.data.frame(cov.pds)[6:7]),
  mean.pdc = rowMeans(as.data.frame(cov.pdc)[6:7])
)

cov.mocks$name <- NULL
cov.pds$name <- NULL
cov.pdc$name <- NULL

de_pds <- bw_granges_diff_analysis(cov.mocks, cov.pds, "Mock", "PDS")
lfc_pds = DESeq2::lfcShrink(de_pds, coef = "condition_PDS_vs_Mock", type = "apeglm")

de_pdc <- bw_granges_diff_analysis(cov.mocks, cov.pdc, "Mock", "PhenDC3")
lfc_pdc = DESeq2::lfcShrink(de_pdc, coef = "condition_PhenDC3_vs_Mock", type = "apeglm")

results$deseq.lfc.pds <- results(de_pds)$log2FoldChange
results$deseq.lfcs.pds <- lfc_pds$log2FoldChange
results$deseq.padj.pds <- lfc_pds$padj
results$deseq.mean.pds <- log2(lfc_pds$baseMean)
results$deseq.sig.pds <- lfc_pds$pvalue < 0.05

results$deseq.lfc.pdc <- results(de_pdc)$log2FoldChange
results$deseq.lfcs.pdc <- lfc_pdc$log2FoldChange
results$deseq.padj.pdc <- lfc_pdc$padj
results$deseq.mean.pdc <- log2(lfc_pdc$baseMean)
results$deseq.sig.pdc <- lfc_pdc$pvalue < 0.05

results$class <- gsub(" .+", "", results$name)
results$pro <- gsub(".+ ", "", results$name)
results$pro <- factor(results$pro, levels = c("Pro", "noPro"))
results$class <- factor(results$class, levels = c("CTCF_and_G4", "CTCF_not_G4"))

results$log2.mock <- log2(results$mean.mock)
results$log2.pds <- log2(results$mean.pds)
results$log2.pdc <- log2(results$mean.pdc)
results$log2.G4 <- log2(results$G4_NT)

results$deseq.sigup.pds <- results$deseq.sig.pds &
  results$deseq.lfc.pds > 0
results$deseq.sigup.pdc <- results$deseq.sig.pdc &
  results$deseq.lfc.pdc > 0
```

```{r}
write.table(results, glue("{result_folder}foldchange_results.txt"))
table(results$name)
```

```{r}
#results <- read.table("foldchange_results.txt")
results$class <- factor(results$class, levels = c("CTCF_and_G4", "CTCF_not_G4"))
```


```{r fig.width=3, fig.height=3}
p <- ggviolin(
  results,
  x = "class",
  y = "mean.mock",
  fill = "class",
  palette = mypal,
  add = "median_iqr"
) + coord_cartesian(ylim = c(0, 10))
annotate_figure(p, fig.lab = "CTCF signal by class in mock condition, mean of two reps, median+iqr", fig.lab.size = 6)
ggsave(glue("{plot_folder}Violin_CTCF_classes.pdf"), last_plot())
```

```{r fig.width=3, fig.height=3}
p <- ggviolin(
  results,
  x = "class",
  y = "mean.mock",
  fill = "pro",
  palette = mypal,
  add = "median_iqr"
) + coord_cartesian(ylim = c(0, 10))
annotate_figure(p, fig.lab = "CTCF signal by class in mock condition, mean of two reps, median+iqr", fig.lab.size = 6)
ggsave(glue("{plot_folder}Violin_CTCF_classes_pro.pdf"),
       last_plot())
```

```{r fig.width=5, fig.height=3}
mdf <- reshape2::melt(dplyr::select(
  results,
  c(
    "class",
    "mock_1",
    "mock_2",
    "PDS_1",
    "PDS_2",
    "PhenDC3_1",
    "PhenDC3_2"
  )
))
ggboxplot(
  mdf,
  x = "variable",
  y = "value",
  fill = "class",
  palette = mypal
) + coord_cartesian(ylim = c(0, 10))
ggsave(glue("{plot_folder}Boxplot_CTCF_reps.pdf"), last_plot())
```

```{r fig.width=5, fig.height=3}
mdf <- reshape2::melt(dplyr::select(
  results,
  c(
    "class",
    "mock_1",
    "mock_2",
    "PDS_1",
    "PDS_2",
    "PhenDC3_1",
    "PhenDC3_2"
  )
))

mdf_stats_class = compare_means(value ~ class, group.by = "variable", data = mdf)

ggviolin(
  mdf,
  x = "variable",
  y = "value",
  fill = "class",
  palette = mypal,
  add = "median_iqr"
) + coord_cartesian(ylim = c(0, 10))
ggsave(glue("{plot_folder}Violin_CTCF_reps.pdf"), last_plot())
```
```{r fig.width=3, fig.height=3}
mdf <- reshape2::melt(dplyr::select(results, c(
  "class", "deseq.lfc.pds", "deseq.lfc.pdc"
)))
p <- ggviolin(
  mdf,
  x = "variable",
  y = "value",
  fill = "class",
  palette = mypal,
  add = "median_iqr"
) + geom_hline(yintercept = 0, linetype = "dotted") + coord_cartesian(ylim =
                                                                        c(-3, 3)) + stat_compare_means(aes(group = class), label.y = 3, size = 2)

annotate_figure(p, fig.lab = "DESEq foldchange mean treat vs mean mock, median+iqr", fig.lab.size = 6)
ggsave(
  glue("{plot_folder}Violin_CTCF_lfc.pdf"),
  last_plot(),
  width = 5,
  height = 5
)
```
```{r fig.width=3, fig.height=3}
p <- ggboxplot(
  mdf,
  x = "variable",
  y = "value",
  fill = "class",
  palette = mypal
) + geom_hline(yintercept = 0, linetype = "dotted") + coord_cartesian(ylim =
                                                                        c(-2, 2))
annotate_figure(p, fig.lab = "DESEq foldchange mean treat vs mean mock, median+iqr", fig.lab.size = 6)
ggsave(glue("{plot_folder}Boxplot_CTCF_lfc.pdf"), last_plot())
```

```{r fig.width=5, fig.height=6}
mdf <- reshape2::melt(dplyr::select(results, c(
  "pro", "class", "deseq.lfc.pds", "deseq.lfc.pdc"
)))
mdf$pro <- factor(mdf$pro, levels = c("Pro", "noPro"))
mdf$x <- as.factor(paste0(mdf$class, " ", mdf$variable))
mdf$x <- factor(mdf$x, levels = levels(mdf$x)[c(2, 1, 4, 3)])
p <- ggviolin(
  mdf,
  x = "x",
  y = "value",
  fill = "class",
  palette = mypal,
  add = "median_iqr",
  facet.by = "pro"
) + geom_hline(yintercept = 0, linetype = "dotted") + coord_cartesian(ylim =
                                                                        c(-2, 2)) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
annotate_figure(p, fig.lab = "DESEq foldchange mean treat vs mean mock, median+iqr", fig.lab.size = 6)
ggsave(glue("{plot_folder}Violin_CTCF_lfc_pro.pdf"), last_plot())
```

```{r}
med <- aggregate(deseq.lfc.pds ~ class,
                 data = results,
                 FUN = "median",
                 na.rm = T)
med$deseq.fc.pds <- 2 ^ med$deseq.lfc.pds
med
```

```{r}
med <- aggregate(deseq.lfc.pds ~ name,
                 data = results,
                 FUN = "median",
                 na.rm = T)
med$deseq.fc.pds <- 2 ^ med$deseq.lfc.pds
med
```
```{r}
med <- aggregate(deseq.lfc.pds ~ class,
                 data = results,
                 FUN = "median",
                 na.rm = T)
med$deseq.fc.pds <- 2 ^ med$deseq.lfc.pds
med
```
```{r}
med <- aggregate(deseq.lfc.pdc ~ name,
                 data = results,
                 FUN = "median",
                 na.rm = T)
med$deseq.fc.pdc <- 2 ^ med$deseq.lfc.pdc
med
```

```{r}
deseq_lfc_stats = compare_means(deseq.lfc.pds ~ class, data = results)
deseq_lfc_stats
write_tsv(deseq_lfc_stats,
          glue("{stat_output}pds-deseq2_lfc_statistics.tsv"))

```

```{r}
deseq_lfc_stats = compare_means(deseq.lfc.pdc ~ class, data = results)
deseq_lfc_stats
write_tsv(deseq_lfc_stats,
          glue("{stat_output}phendc-deseq2_lfc_statistics.tsv"))
```

```{r fig.width=5, fig.height=3}
mdf <- reshape2::melt(dplyr::select(
  results,
  c(
    "class",
    "raw.lfc.pds_1",
    "raw.lfc.pds_2",
    "raw.lfc.pdc_1",
    "raw.lfc.pdc_2"
  )
))
ggviolin(
  mdf,
  x = "variable",
  y = "value",
  fill = "class",
  palette = mypal,
  add = "median_iqr"
) + geom_hline(yintercept = 0, linetype = "dotted") + coord_cartesian(ylim =
                                                                        c(-3, 3)) +
  stat_compare_means(aes(group = class), label.y = 3.6, size = 2)
ggsave(glue("{plot_folder}Violin_CTCF_lfc_individual_reps.pdf"), last_plot())
```

```{r fig.width=3, fig.height=3}
ggdensity(
  results,
  x = "raw.lfc.pds",
  color = "class",
  fill = "class",
  palette = mypal
) + geom_vline(xintercept = 0, linetype = "dotted")
```

```{r fig.width=3, fig.height=3}
ggdensity(
  results,
  x = "raw.lfc.pdc",
  color = "class",
  fill = "class",
  palette = mypal
) + geom_vline(xintercept = 0, linetype = "dotted")
```


```{r fig.width=3, fig.height=3}
ggscatter(
  results,
  x = "log2.mock",
  y = "log2.pds",
  size = 1,shape=20,
  alpha = 0.1,
  color = "deseq.sig.pds",
  palette = mypal[c(5, 2)]
)
ggsave(glue("{plot_folder}Scatter_CTCF_DESEq_PDSvsMock.pdf"), rasterize(last_plot(),dpi = 600),width = 3, height= 3.2)
```
```{r fig.width=4, fig.height=4}
ggscatter(
  results,
  x = "log2.mock",
  y = "log2.pdc",
  size = 1,shape=20,
  alpha = 0.1,
  color = "deseq.sig.pdc",
  palette = mypal[c(5, 2)]
)
ggsave(glue("{plot_folder}Scatter_CTCF_DESEq_PhenDC3vsMock.pdf"), rasterize(last_plot(),dpi = 600),width = 3, height= 3.2)
```
```{r}
tb <- table(results$class)
deseq.stats <- as.data.frame(t(as.matrix(tb)))
deseq.stats[1,] <- colSums(deseq.stats)
rownames(deseq.stats) <- c("Total")
deseq.stats.percent <- deseq.stats
deseq.stats.percent[1,] <- c(100,100)
tb
```

```{r}
tb <- table(results$deseq.sig.pds &
        (results$deseq.lfc.pds > 0),
      results$class)
deseq.stats <- rbind(deseq.stats, PDS.sig.up = as.data.frame.matrix(tb)[2,])
tb
```
```{r}
tb <- prop.table(table(results$deseq.sig.pds &
        (results$deseq.lfc.pds > 0),
      results$class),margin = 2)*100
deseq.stats.percent <- rbind(deseq.stats.percent, PDS.sig.up  = as.data.frame.matrix(tb)[2,])
tb
```

```{r fig.width=3, fig.height=2.5}
table(results$deseq.sig.pds &
        (results$deseq.lfc.pds > 0),
      results$class)
mdf <- reshape2::melt(table(
  results$deseq.sig.pds & (results$deseq.lfc.pds > 0),
  results$class
))
ggplot(mdf, aes(Var1, Var2, fill = value)) +
  geom_tile(show.legend = F) + geom_text(aes(label = value)) +
  scale_fill_gradient(low = "white", high = "orange") + theme_minimal()
```

```{r}
vl <- list(
  sig = grep("TRUE", results$deseq.sigup.pds),
  CTCF_G4 = grep("and", results$class),
  CTCFonly = grep("not", results$class)
)
plot(euler(vl), quantities = T)
```

```{r}
tb <- table(results$deseq.sig.pdc &
        (results$deseq.lfc.pdc > 0),
      results$class)
deseq.stats <- rbind(deseq.stats, PhenDC3.sig.up = as.data.frame.matrix(tb)[2,])
tb
```

```{r}
tb <- prop.table(table(results$deseq.sig.pdc &
        (results$deseq.lfc.pdc > 0),
      results$class),margin = 2)*100
deseq.stats.percent <- rbind(deseq.stats.percent, PhenDC3.sig.up = as.data.frame.matrix(tb)[2,])
tb
```


```{r fig.width=3, fig.height=2.5}
table(results$deseq.sig.pdc &
        (results$deseq.lfc.pdc > 0),
      results$class)
mdf <- reshape2::melt(table(
  results$deseq.sig.pdc & (results$deseq.lfc.pdc > 0),
  results$class
))
ggplot(mdf, aes(Var1, Var2, fill = value)) +
  geom_tile(show.legend = F) + geom_text(aes(label = value)) +
  scale_fill_gradient(low = "white", high = "orange") + theme_minimal()
```


```{r}
results$uid <- seq(1:nrow(results))
vl <- list(
  sig = grep("TRUE", results$deseq.sigup.pdc),
  CTCF_G4 = grep("and", results$class),
  CTCFonly = grep("not", results$class)
)
plot(euler(vl), quantities = T)
```
 
```{r fig.width=3, fig.height=2.5}
table(results$deseq.sigup.pds, results$deseq.sigup.pdc)
mdf <- reshape2::melt(table(results$deseq.sigup.pds, results$deseq.sigup.pdc))
ggplot(mdf, aes(Var1, Var2, fill = value)) +
  geom_tile(show.legend = F) + geom_text(aes(label = value)) +
  scale_fill_gradient(low = "white", high = "orange") + theme_minimal()
```
```{r}
tb <- table(results$deseq.sigup.pds & results$deseq.sigup.pdc,
      results$class)
deseq.stats <- rbind(deseq.stats, both.sig.up = as.data.frame.matrix(tb)[2,])
tb
```

```{r}
tb <- prop.table(table(results$deseq.sigup.pds & results$deseq.sigup.pdc,
      results$class),margin = 2)*100
deseq.stats.percent <- rbind(deseq.stats.percent, both = as.data.frame.matrix(tb)[2,])
tb
```

```{r fig.width=3, fig.height=2.5}
table(results$deseq.sigup.pds & results$deseq.sigup.pdc, results$class)
mdf <- reshape2::melt(table(results$deseq.sigup.pds & results$deseq.sigup.pdc, results$class))
ggplot(mdf, aes(Var1, Var2, fill = value)) +
  geom_tile(show.legend = F) + geom_text(aes(label = value)) +
  scale_fill_gradient(low = "white", high = "orange") + theme_minimal()
```

```{r fig.width=2.5, fig.height=2.5}
deseq.stats$class <- rownames(deseq.stats) 
mdf <- reshape2::melt(deseq.stats)
ggplot(mdf, aes(variable, class, fill = value)) +
  geom_tile(show.legend = F) + geom_text(aes(label = value)) +
  scale_fill_gradient(low = "white", high = "orange") + theme_minimal()
ggsave(glue("{plot_folder}Table_CTCF_DESEq_Sigup.pdf"), rasterize(last_plot(),dpi = 600),width = 2.5, height= 2.5)
```
```{r fig.width=2.5, fig.height=2.5}
deseq.stats.percent$class <- rownames(deseq.stats.percent) 
mdf <- reshape2::melt(deseq.stats.percent[-1,])
ggplot(mdf, aes(variable, class, fill = value)) +
  geom_tile(show.legend = F) + geom_text(aes(label = round(mdf$value,2))) +
  scale_fill_gradient(low = "white", high = "orange") + theme_minimal()
ggsave(glue("{plot_folder}Table_CTCF_DESEq_Sigup_percent.pdf"), rasterize(last_plot(),dpi = 600),width = 2.5, height= 2.5)
```

```{r}
vl <- list(
  sig_PDS = grep("TRUE", results$deseq.sigup.pds),
  sig_PDC = grep("TRUE", results$deseq.sigup.pdc),
  CTCF_G4 = grep("and", results$class),
  CTCFonly = grep("not", results$class)
)
plot(euler(vl), quantities = T)
```

```{r}
results$uid <- seq(1:nrow(results))
vl <- list(
  sig_PDS = grep("TRUE", results$deseq.sigup.pds),
  sig_PDC = grep("TRUE", results$deseq.sigup.pdc),
  CTCF_G4 = grep("and", results$class)
)
plot(euler(vl), quantities = T)
```
```{r fig.width=6, fig.height=6}
results$sig.by.class.pds <- paste0(results$deseq.sigup.pds, "_", results$class)
results$psize <- 0.01
results$psize[results$deseq.sigup.pds &
                results$class == "CTCF_and_G4"] <- 1

ggscatter(
  results,
  x = "log2.mock",
  y = "log2.pds",
  size = 0.5,
  alpha = results$psize,
  color = "sig.by.class.pds",
  palette = c(mypal[5], mypal[5], mypal[5], mypal[2], mypal[1])
)
```
```{r fig.width=6, fig.height=6}
ggscatter(
  results[!grepl("NA", results$sig.by.class.pds), ],
  x = "log2.mock",
  y = "deseq.lfc.pds",
  size = 0.2,
  alpha = "psize",
  color = "sig.by.class.pds",
  palette = c("#505050", "#505050", "red2", "blue")
)
```

```{r fig.width=5, fig.height=5}
ggscatterhist(
  results[!grepl("NA", results$sig.by.class.pds), ],
  x = "log2.mock",
  y = "log2.pds",
  size = 0.4,
  alpha = "psize",
  color = "sig.by.class.pds",
  margin.params = list(
    fill = "sig.by.class.pds",
    color = "black",
    size = 0.2
  ),
  palette = c("#505050", "#505050", "red2", "blue")
)
```

```{r fig.width=3, fig.height=3}
ggscatter(
  results,
  x = "raw.lfc.pds",
  y = "log2.G4",
  size = 0.2,
  alpha = 0.1,
  color = "deseq.sig.pds",
  cor.coef = T,
  palette = c("#888888",mypal[2])
)
ggsave(glue("{plot_folder}Scatter_G4_vs_LFC_PDS.pdf"),
       plot = rasterize(last_plot()))
```
```{r fig.width=3, fig.height=3}
ggscatter(
  results,
  x = "raw.lfc.pdc",
  y = "log2.G4",
  size = 0.2,
  alpha = 0.1,
  color = "deseq.sig.pdc",
  cor.coef = T,
  palette = c("#888888",mypal[2])
)
ggsave(glue("{plot_folder}Scatter_G4_vs_LFC_PDC.pdf"),
       plot = rasterize(last_plot()))
```

```{r fig.width=3, fig.height=3}
results$G4.quantile <- dplyr::ntile(results$G4_NT, n = 5)
results$pds.quantile <- dplyr::ntile(results$mean.pds-results$mean.mock, n = 4)
results$pdc.quantile <- dplyr::ntile(results$mean.pdc-results$mean.mock, n = 4)
```


```{r fig.width=4, fig.height=3}
tb <- table(results$pds.quantile, results$class)
tb
ggbarplot(as.data.frame(t(as.matrix(tb))),"Var2","Freq",fill="Var1",palette = c(mypal[1],"#aaaaaa"),label = TRUE, lab.pos="in",ylim=c(11000,13000))
ggsave(glue("{plot_folder}Barplot_peakCat_by_PDSquantile.pdf"),
       plot = last_plot())
```

```{r fig.width=3, fig.height=3}
ggviolin(
  results,
  x = "pds.quantile",
  y = "deseq.lfc.pds",
  fill = mypal[3],
  add = "median_iqr"
) + coord_cartesian(ylim = c(-1.5, 1.5)) + geom_hline(yintercept = 0, linetype =
                                                    "dotted")
ggsave(glue("{plot_folder}Violin_lfcPDS_by_PDSquantile.pdf"),
       plot = last_plot())
```

```{r fig.width=6, fig.height=3}
top25.pds <- makeGRangesFromDataFrame(results[results$pds.quantile==4,1:3],na.rm=T)
bot75.pds <- makeGRangesFromDataFrame(results[results$pds.quantile!=4,1:3],na.rm=T)
p1 <- plot_bw_profile(bwfiles = geo_bigwigs,loci = top25.pds, mode = "center",verbose = F, colors = mypal) + coord_cartesian(ylim=c(0,6))
p2 <- plot_bw_profile(bwfiles = geo_bigwigs,loci = bot75.pds, mode = "center",verbose = F, colors = mypal) + coord_cartesian(ylim=c(0,6))
ggarrange(p1,p2, ncol=2)
ggsave(glue("{plot_folder}Profile_CTCF_top25_PDSquantile.pdf"),
       plot = last_plot())
```
```{r fig.width=6, fig.height=3}
top25.pds.CTCF_G4 <- makeGRangesFromDataFrame(results[(results$pds.quantile==4 & results$class == "CTCF_and_G4"),1:3],na.rm=T)
top25.pds.CTCF_only <- makeGRangesFromDataFrame(results[(results$pds.quantile==4  &  results$class == "CTCF_not_G4"),1:3],na.rm=T)

p1 <- plot_bw_profile(bwfiles = geo_bigwigs[1:2],loci = top25.pds.CTCF_G4, mode = "center",verbose = F, colors = mypal[c(1,2)]) + coord_cartesian(ylim=c(0,10))
p2 <- plot_bw_profile(bwfiles = geo_bigwigs[1:2],loci = top25.pds.CTCF_only, mode = "center",verbose = F, colors = mypal[c(1,2)]) + coord_cartesian(ylim=c(0,10))
ggarrange(p1,p2, ncol=2)
ggsave(glue("{plot_folder}Profile_CTCF_top25_PDSquantile_byG4overlap.pdf"),
       plot = last_plot())
```

```{r fig.width=6, fig.height=3}
top25.pds.CTCF_G4 <- makeGRangesFromDataFrame(results[(results$pds.quantile==4 & results$class == "CTCF_and_G4"),1:3],na.rm=T)
top25.pds.CTCF_only <- makeGRangesFromDataFrame(results[(results$pds.quantile==4  &  results$class == "CTCF_not_G4"),1:3],na.rm=T)

p1 <- plot_bw_profile(bwfiles = geo_bigwigs[2],bg_bwfiles = geo_bigwigs[1], loci = top25.pds.CTCF_G4, mode = "center",verbose = F, colors = mypal[2],norm_mode = "log2fc",show_error = T) + coord_cartesian(ylim=c(-1,1))
p2 <- plot_bw_profile(bwfiles = geo_bigwigs[2],bg_bwfiles = geo_bigwigs[1], loci = top25.pds.CTCF_only, mode = "center",verbose = F, colors = mypal[2],norm_mode = "log2fc",show_error = T) + coord_cartesian(ylim=c(-1,1))
ggarrange(p1,p2, ncol=2)
ggsave(glue("{plot_folder}Profile_CTCFfoldchange_top25_PDSquantile_byG4.pdf"),
       plot = last_plot())
```

```{r fig.width=6, fig.height=3}
top25.pds.CTCF_G4 <- makeGRangesFromDataFrame(results[(results$pds.quantile==4 & results$class == "CTCF_and_G4"),1:3],na.rm=T)
top25.pds.CTCF_only <- makeGRangesFromDataFrame(results[(results$pds.quantile==4  &  results$class == "CTCF_not_G4"),1:3],na.rm=T)

p1 <- plot_bw_profile(bwfiles = G4_bigwigs, loci = top25.pds.CTCF_G4, mode = "center",verbose = F, colors = mypal[3],norm_mode = "log2fc") + coord_cartesian(ylim=c(0,10))
p2 <- plot_bw_profile(bwfiles = G4_bigwigs, loci = top25.pds.CTCF_only, mode = "center",verbose = F, colors = mypal[3],norm_mode = "log2fc") + coord_cartesian(ylim=c(0,10))
ggarrange(p1,p2, ncol=2)
```


```{r fig.width=3, fig.height=3}
ord.top25 <- order(rowMeans(as.data.frame(bw_heatmap(bwfiles = geo_bigwigs[1],loci = top25.pds, mode = "center"))))
p1 <- plot_bw_heatmap(bwfiles = geo_bigwigs[1],loci = top25.pds, mode = "center",verbose = F, zmax = 10, max_rows_allowed = 50, order_by = ord.top25)
p2 <- plot_bw_heatmap(bwfiles = geo_bigwigs[2],loci = top25.pds, mode = "center",verbose = F, zmax = 10, max_rows_allowed = 50, order_by = ord.top25)
ggarrange(p1,p2, ncol=2, common.legend = T)
ggsave(glue("{plot_folder}Heatmap_CTCF_top25_PDSquantile.pdf"),
       plot = last_plot())
```

```{r fig.width=3, fig.height=4}
ord.bot75 <- order(rowMeans(as.data.frame(bw_heatmap(bwfiles = geo_bigwigs[1],loci = bot75.pds, mode = "center"))))
p1 <- plot_bw_heatmap(bwfiles = geo_bigwigs[1],loci = bot75.pds, mode = "center",verbose = F, zmax = 10, max_rows_allowed = 200, order_by = ord.bot75)
p2 <- plot_bw_heatmap(bwfiles = geo_bigwigs[2],loci = bot75.pds, mode = "center",verbose = F, zmax = 10, max_rows_allowed = 200, order_by = ord.bot75)
ggarrange(p1,p2, ncol=2, common.legend = T)
ggsave(glue("{plot_folder}Heatmap_CTCF_bot75_PDSquantile.pdf"),
       plot = last_plot())
```

```{r fig.width=3, fig.height=3}
G4_CnR <- bw_loci(c("../mendeley/FigureS2/GSM6634325_E14_Mock_G4.bw","../mendeley/FigureS2/GSM6634326_E14_PDS_G4.bw"),loci = makeGRangesFromDataFrame(results,na.rm=T),labels = c("Mock","PDS"))
results$G4_CnR_delta <- G4_CnR$PDS-G4_CnR$Mock
results$G4_CnR_lfc <- log2(G4_CnR$PDS/G4_CnR$Mock)
results$G4_CnR_mock <- G4_CnR$Mock
```

```{r fig.width=3, fig.height=2}
ggviolin(
  results,
  x = "pds.quantile",
  y = "G4_CnR_delta",
  fill = mypal[2],
  add = "median_iqr"
) + coord_cartesian(ylim = c(-10, 15)) + geom_hline(yintercept = 0, linetype =
                                                    "dotted")
ggsave(glue("{plot_folder}Violin_G4_CnR_delta_by_PDSquantile.pdf"),
       plot = last_plot())
```
```{r}
compare_means(G4_CnR_delta ~ pds.quantile, results)
```
```{r}
results$pds.quartile.top25 <- results$pds.quantile == 4
mean(na.omit(results$G4_CnR_delta))
mean(results$G4_CnR_delta[results$pds.quartile.top25])
mean(results$G4_CnR_delta[!results$pds.quartile.top25])
compare_means(G4_CnR_delta ~ pds.quartile.top25, results)
```


```{r}
export.bed(top25.pds,"../peaks/CTCF_peaks_top25_pds_up.bed")
export.bed(bot75.pds,"../peaks/CTCF_peaks_bot75_pds_up.bed")
```

```{r fig.width=3, fig.height=3}
ggviolin(
  results,
  x = "G4.quantile",
  y = "deseq.lfc.pds",
  fill = mypal[1],
  add = "median_iqr"
) + coord_cartesian(ylim = c(-2, 2)) + geom_hline(yintercept = 0, linetype =
                                                    "dotted")
ggsave(glue("{plot_folder}Violin_CTCF_lfc_by_G4quantile.pdf"),
       plot = last_plot())
```



```{r fig.width=3, fig.height=3}
ggviolin(
  results,
  x = "G4.quantile",
  y = "log2.G4",
  fill = mypal2[5],
  add = "median_iqr"
) + geom_hline(yintercept = 0, linetype = "dotted")
ggsave(glue("{plot_folder}Violin_G4_by_G4quantile.pdf"), plot = last_plot())
```

```{r fig.width=3, fig.height=3}
peak_cats <- bedscout::import_named_bed_into_list(peaks_bed)
plot_bw_profile(
  G4_bigwigs,
  peak_cats,
  labels = c(
    peak_cats[[1]][1, ]$name,
    peak_cats[[2]][1, ]$name,
    peak_cats[[3]][1, ]$name,
    peak_cats[[4]][1, ]$name
  ),
  mode = "center",
  show_error = T,
  verbose = F,
  remove_top = 0.001,
  colors = mypal
)
```

```{r fig.width=3, fig.height=3}
plot_bw_profile(
  combined_bigwigs[1],
  peak_cats,
  labels = c(
    peak_cats[[1]][1, ]$name,
    peak_cats[[2]][1, ]$name,
    peak_cats[[3]][1, ]$name,
    peak_cats[[4]][1, ]$name
  ),
  mode = "center",
  show_error = T,
  verbose = F,
  remove_top = 0.001,
  colors = mypal
)
```




```{r fig.width=12, fig.height=3}
p1 <- plot_bw_profile(
  c(mocks_bigwigs, pds_bigwigs),
  peak_cats[[1]],
  labels = c("mock1", "mock2", "trt1", "rtr2"),
  mode = "center",
  show_error = T,
  verbose = F,
  remove_top = 0.001,
  colors = mypal2[c(9, 10, 5, 6)],
  upstream = 1500,
  downstream = 1500
)
p2 <- plot_bw_profile(
  c(mocks_bigwigs, pds_bigwigs),
  peak_cats[[2]],
  labels = c("mock1", "mock2", "trt1", "rtr2"),
  mode = "center",
  show_error = T,
  verbose = F,
  remove_top = 0.001,
  colors = mypal2[c(9, 10, 5, 6)],
  upstream = 1500,
  downstream = 1500
)
p3 <- plot_bw_profile(
  c(mocks_bigwigs, pds_bigwigs),
  peak_cats[[3]],
  labels = c("mock1", "mock2", "trt1", "rtr2"),
  mode = "center",
  show_error = T,
  verbose = F,
  remove_top = 0.001,
  colors = mypal2[c(9, 10, 5, 6)],
  upstream = 1500,
  downstream = 1500
)
p4 <- plot_bw_profile(
  c(mocks_bigwigs, pds_bigwigs),
  peak_cats[[4]],
  labels = c("mock1", "mock2", "trt1", "rtr2"),
  mode = "center",
  show_error = T,
  verbose = F,
  remove_top = 0.001,
  colors = mypal2[c(9, 10, 5, 6)],
  upstream = 1500,
  downstream = 1500
)

ggarrange(p1, p2, p3, p4, ncol = 4, nrow = 1)
```

```{r fig.width=12, fig.height=3}
p1 <- plot_bw_profile(
  c(mocks_bigwigs, pds_bigwigs),
  peak_cats[[1]],
  labels = c("mock1", "mock2", "trt1", "rtr2"),
  mode = "center",
  show_error = T,
  verbose = F,
  remove_top = 0.001,
  colors = mypal2[c(9, 10, 3, 4)],
  upstream = 1500,
  downstream = 1500
)
p2 <- plot_bw_profile(
  c(mocks_bigwigs, pds_bigwigs),
  peak_cats[[2]],
  labels = c("mock1", "mock2", "trt1", "rtr2"),
  mode = "center",
  show_error = T,
  verbose = F,
  remove_top = 0.001,
  colors = mypal2[c(9, 10, 3, 4)],
  upstream = 1500,
  downstream = 1500
)
p3 <- plot_bw_profile(
  c(mocks_bigwigs, pds_bigwigs),
  peak_cats[[3]],
  labels = c("mock1", "mock2", "trt1", "rtr2"),
  mode = "center",
  show_error = T,
  verbose = F,
  remove_top = 0.001,
  colors = mypal2[c(9, 10, 3, 4)],
  upstream = 1500,
  downstream = 1500
)
p4 <- plot_bw_profile(
  c(mocks_bigwigs, pds_bigwigs),
  peak_cats[[4]],
  labels = c("mock1", "mock2", "trt1", "rtr2"),
  mode = "center",
  show_error = T,
  verbose = F,
  remove_top = 0.001,
  colors = mypal2[c(9, 10, 3, 4)],
  upstream = 1500,
  downstream = 1500
)

ggarrange(p1, p2, p3, p4, ncol = 4, nrow = 1)
```
### cen we learn something from the genes with top-most increase in CTCF?
```{r}
sigup.pds <- results[results$deseq.sigup.pds, ]
sigup.pdc <- results[results$deseq.sigup.pdc, ]

#export.bed(sigup.pds,"peaks_sigup_pds.bed")
#export.bed(sigup.pds,"peaks_sigup_pds.bed")
```


```{r}
gghistogram(results, "width", add = "median", fill = "class")
```

### Distance-based analysis

```{r}
G4_bed <- import('../data/G4_CnT/G4_WT_peaks.narrowPeak')
ATAC_bed <- import('../data/ATAC-seq/ATAC_seq_mESC_Martire_peaks.narrowPeak')
pro_bed <- import('../data/regions/promoters_geneSymbol.mm10.bed')

G4_bed <- bedscout::annotate_overlapping_features(G4_bed, pro_bed, name_field = "name")

G4_bed$name <- "G4"
G4_bed$name[!is.na(G4_bed$nearby_features)] <- "G4pro"
G4_bed$name[grepl("pro", G4_bed$name)] <- "G4pro"
```

```{r}
nearest_G4_1kb <- bedscout::annotate_nearby_features(
  ctcf,
  G4_bed,
  distance_cutoff = 1000,
  ignore.strand = T,
  name_field = "name"
)
nearest_G4_2.5kb <- bedscout::annotate_nearby_features(
  ctcf,
  G4_bed,
  distance_cutoff = 2500,
  ignore.strand = T,
  name_field = "name"
)
nearest_G4_5kb <- bedscout::annotate_nearby_features(
  ctcf,
  G4_bed,
  distance_cutoff = 5000,
  ignore.strand = T,
  name_field = "name"
)
nearest_G4_10kb <- bedscout::annotate_nearby_features(
  ctcf,
  G4_bed,
  distance_cutoff = 10000,
  ignore.strand = T,
  name_field = "name"
)
nearest_G4_50kb <- bedscout::annotate_nearby_features(
  ctcf,
  G4_bed,
  distance_cutoff = 50000,
  ignore.strand = T,
  name_field = "name"
)

ctcf$nearest_G4 <- factor(">50kb",
                          levels = c("<1kb", "<2.5kb", "<5kb", "<10kb", "<50kb", ">50kb"))
ctcf$nearest_G4[!is.na(nearest_G4_50kb$nearby_features)] <- "<50kb"
ctcf$nearest_G4[!is.na(nearest_G4_10kb$nearby_features)] <- "<10kb"
ctcf$nearest_G4[!is.na(nearest_G4_5kb$nearby_features)] <- "<5kb"
ctcf$nearest_G4[!is.na(nearest_G4_2.5kb$nearby_features)] <- "<2.5kb"
ctcf$nearest_G4[!is.na(nearest_G4_1kb$nearby_features)] <- "<1kb"

ctcf$nearest_G4_type <- "none"
ctcf$nearest_G4_type[!is.na(nearest_G4_50kb$nearby_features)] <- nearest_G4_50kb$nearby_features[!is.na(nearest_G4_50kb$nearby_features)]
ctcf$nearest_G4_type[!is.na(nearest_G4_10kb$nearby_features)] <- nearest_G4_10kb$nearby_features[!is.na(nearest_G4_10kb$nearby_features)]
ctcf$nearest_G4_type[!is.na(nearest_G4_5kb$nearby_features)] <- nearest_G4_5kb$nearby_features[!is.na(nearest_G4_5kb$nearby_features)]
ctcf$nearest_G4_type[!is.na(nearest_G4_2.5kb$nearby_features)] <- nearest_G4_2.5kb$nearby_features[!is.na(nearest_G4_2.5kb$nearby_features)]
ctcf$nearest_G4_type[!is.na(nearest_G4_1kb$nearby_features)] <- nearest_G4_1kb$nearby_features[!is.na(nearest_G4_1kb$nearby_features)]

ctcf$nearest_G4_type[grep("pro", ctcf$nearest_G4_type)] <- "G4pro"

results$nearest_G4 <- ctcf$nearest_G4
results$nearest_G4_type <- ctcf$nearest_G4_type
table(results$nearest_G4)
table(results$nearest_G4_type)
table(results$nearest_G4, results$nearest_G4_type)
```
```{r fig.width=3, fig.height=3}
ggviolin(
  results,
  x = "nearest_G4",
  y = "mean.mock",
  fill = mypal[1],
  add = "median_iqr"
) + coord_cartesian(ylim = c(0, 10)) +
  stat_compare_means(label.y = 8,
                     label.x = 3,
                     size = 3) +
  geom_hline(yintercept = 0, linetype = "dotted")
ggsave(glue("{plot_folder}Violin_CTCF_signal_by_G4distance.pdf"),
       plot = last_plot())
```

```{r fig.width=3, fig.height=3}
ggviolin(
  results,
  x = "nearest_G4",
  y = "deseq.lfc.pds",
  fill = mypal[1],
  add = "median_iqr"
) + coord_cartesian(ylim = c(-4, 4)) + geom_hline(yintercept = 0, linewidth = 0.2) +
  geom_hline(
    yintercept = median(results$deseq.lfc.pds[results$nearest_G4 == "<1kb"]),
    linetype = "dotted",
    linewidth = 0.2
  ) +
  stat_compare_means(label.y = 3,
                     label.x = 2,
                     size = 3)
ggsave(glue("{plot_folder}Violin_CTCF_PDS_lfc_by_G4distance.pdf"),
       plot = last_plot())
```
```{r}
nearest_G4_stats = compare_means(deseq.lfc.pds ~ nearest_G4, results)
nearest_G4_stats
write_tsv(nearest_G4_stats,
          glue("{stat_output}pds_G4distance_plots-statistics.tsv"))
```

```{r fig.width=3, fig.height=3}
ggviolin(
  results,
  x = "nearest_G4",
  y = "deseq.lfc.pdc",
  fill = mypal[1],
  add = "median_iqr"
) + coord_cartesian(ylim = c(-2, 2)) + geom_hline(yintercept = 0, linewidth = 0.2) + geom_hline(
  yintercept = mean(results$deseq.lfc.pdc[results$nearest_G4 == "<1kb"]),
  linetype = "dotted",
  linewidth = 0.2
)
ggsave(glue("{plot_folder}Violin_CTCF_PhenDC3_lfc_by_G4distance.pdf"), plot = last_plot())
```

```{r}
nearest_G4_stats = compare_means(deseq.lfc.pdc ~ nearest_G4, results)
nearest_G4_stats
write_tsv(nearest_G4_stats, glue("{stat_output}phendc_G4distance_plots-statistics.tsv"))
```
```{r fig.width=5, fig.height=3}
ggviolin(
  results,
  x = "nearest_G4",
  y = "deseq.lfc.pds",
  fill = "nearest_G4_type",
  palette = mypal[c(1, 3, 5)],
  add = "median_iqr"
) + coord_cartesian(ylim = c(-2, 4)) + geom_hline(yintercept = 0, linewidth = 0.2) + geom_hline(
  yintercept = mean(results$deseq.lfc.pds[results$nearest_G4 == "<1kb"]),
  linetype = "dotted",
  linewidth = 0.2
) +
  stat_compare_means(aes(group = nearest_G4_type))
ggsave(glue("{plot_folder}Violin_CTCF_PDS_lfc_by_G4distance_pro.pdf"),
       plot = last_plot())
```
```{r}
compare_means(deseq.lfc.pds ~ nearest_G4, results[results$nearest_G4_type ==
                                                    "G4", ])
compare_means(deseq.lfc.pds ~ nearest_G4, results[results$nearest_G4_type ==
                                                    "G4", ]) %>%
  write_tsv(.,
            glue(
              "{stat_output}pds_G4distance_plots-nearest_G4-statistics.tsv"
            ))
```
```{r}
compare_means(deseq.lfc.pds ~ nearest_G4, results[results$nearest_G4_type ==
                                                    "G4pro", ])
compare_means(deseq.lfc.pds ~ nearest_G4, results[results$nearest_G4_type ==
                                                    "G4pro", ]) %>%
  write_tsv(.,
            glue(
              "{stat_output}pds_G4distance_plots-nearest_G4pro-statistics.tsv"
            ))
```
```{r fig.width=5, fig.height=3}
ggviolin(
  results,
  x = "nearest_G4",
  y = "deseq.lfc.pdc",
  fill = "nearest_G4_type",
  palette = mypal[c(1, 3, 5)],
  add = "median_iqr"
) + coord_cartesian(ylim = c(-2, 2)) + geom_hline(yintercept = 0, linewidth = 0.2) + geom_hline(
  yintercept = mean(results$deseq.lfc.pdc[results$nearest_G4 == "<1kb"]),
  linetype = "dotted",
  linewidth = 0.2
)
ggsave(glue(
  "{plot_folder}Violin_CTCF_PhenDC3_lfc_by_G4distance_pro.pdf"
),
plot = last_plot())
```

```{r}
compare_means(deseq.lfc.pdc ~ nearest_G4, results[results$nearest_G4_type ==
                                                    "G4", ])
compare_means(deseq.lfc.pdc ~ nearest_G4, results[results$nearest_G4_type ==
                                                    "G4", ]) %>%
  write_tsv(.,
            glue(
              "{stat_output}phendc_G4distance_plots-nearest_G4-statistics.tsv"
            ))
```

```{r}
compare_means(deseq.lfc.pdc ~ nearest_G4, results[results$nearest_G4_type ==
                                                    "G4pro", ])
compare_means(deseq.lfc.pdc ~ nearest_G4, results[results$nearest_G4_type ==
                                                    "G4pro", ]) %>%
  write_tsv(.,
            glue(
              "{stat_output}phendc_G4distance_plots-nearest_G4pro-statistics.tsv"
            ))
```
### Distance-based analysis, this time with predicted G4 motifs, not experimental peaks

```{r}
#results <- read.table(glue("{result_folder}foldchange_results.txt"))
results$class <- factor(results$class, levels = c("CTCF_and_G4", "CTCF_not_G4"))
G4_bed <- import('../data/G4_CnT/G4_WT_peaks.narrowPeak')
G4_bed$name <- "peak"
G4pred_bed <- import('../data/predG4/mm10_canonical_G4_PQS-regex.bed')
ATAC_bed <- import('../data/ATAC-seq/ATAC_seq_mESC_Martire_peaks.narrowPeak')
pro_bed <- import('../data/regions/promoters_geneSymbol.mm10.bed')

G4pred_bed <- bedscout::annotate_overlapping_features(G4pred_bed, G4_bed, name_field = "name")

G4pred_bed$name <- "G4pred"
G4pred_bed$name[grepl("peak", G4pred_bed$nearby_features)] <- "G4exp"

table(G4pred_bed$name)
```

```{r}
nearest_G4_0kb <- bedscout::annotate_nearby_features(
  ctcf,
  G4pred_bed,
  distance_cutoff = 0,
  ignore.strand = T,
  name_field = "name"
)
nearest_G4_1kb <- bedscout::annotate_nearby_features(
  ctcf,
  G4pred_bed,
  distance_cutoff = 1000,
  ignore.strand = T,
  name_field = "name"
)
nearest_G4_2.5kb <- bedscout::annotate_nearby_features(
  ctcf,
  G4pred_bed,
  distance_cutoff = 2500,
  ignore.strand = T,
  name_field = "name"
)
nearest_G4_5kb <- bedscout::annotate_nearby_features(
  ctcf,
  G4pred_bed,
  distance_cutoff = 5000,
  ignore.strand = T,
  name_field = "name"
)
nearest_G4_10kb <- bedscout::annotate_nearby_features(
  ctcf,
  G4pred_bed,
  distance_cutoff = 10000,
  ignore.strand = T,
  name_field = "name"
)


ctcf$nearest_G4 <- factor(">10kb",
                          levels = c("0kb", "<1kb", "<2.5kb", "<5kb", "<10kb", ">10kb"))
ctcf$nearest_G4[!is.na(nearest_G4_10kb$nearby_features)] <- "<10kb"
ctcf$nearest_G4[!is.na(nearest_G4_5kb$nearby_features)] <- "<5kb"
ctcf$nearest_G4[!is.na(nearest_G4_2.5kb$nearby_features)] <- "<2.5kb"
ctcf$nearest_G4[!is.na(nearest_G4_1kb$nearby_features)] <- "<1kb"
ctcf$nearest_G4[!is.na(nearest_G4_0kb$nearby_features)] <- "0kb"

results$nearest_G4 <- ctcf$nearest_G4
table(results$nearest_G4)
table(results$nearest_G4, results$pro)
```


```{r fig.width=5, fig.height=2}
mdf <- reshape2::melt(table(results$nearest_G4, results$pro))
ggplot(mdf, aes(Var1, Var2, fill = value)) +
  geom_tile(show.legend = F) + geom_text(aes(label = value)) +
  scale_fill_gradient(low = "white", high = "orange") + theme_minimal()
ggsave(glue("{plot_folder}Heatmap_CTCF_sites_by_predG4distance.pdf"),
       plot = last_plot())
```

```{r fig.width=3, fig.height=3}
ggviolin(
  results,
  x = "nearest_G4",
  y = "mean.mock",
  fill = mypal[1],
  add = "median_iqr"
) + coord_cartesian(ylim = c(0, 10)) + geom_hline(yintercept = 0, linetype =
                                                    "dotted") +
  stat_compare_means(
    aes(group = nearest_G4),
    label.y = 8,
    label.x = 2,
    size = 3
  )
ggsave(glue(
  "{plot_folder}Violin_CTCF_signal_by_predG4distance.pdf"),
  plot = last_plot())
```

```{r fig.width=3, fig.height=3}
ggviolin(
  results,
  x = "nearest_G4",
  y = "deseq.lfc.pds",
  fill = mypal[1],
  add = "median_iqr"
) + coord_cartesian(ylim = c(-0.5, 1.5)) + geom_hline(yintercept = 0, linewidth = 0.2) + geom_hline(
  yintercept = median(results$deseq.lfc.pds[results$nearest_G4 == "0kb"]),
  linetype = "dotted",
  linewidth = 0.2
) +
  stat_compare_means(label.y = 1.3,
                     label.x = 3,
                     size = 2)
ggsave(glue("{plot_folder}Violin_CTCF_PDS_lfc_by_predG4distance.pdf"),
       plot = last_plot())
```
```{r}
compare_means(deseq.lfc.pds ~ nearest_G4, results)
```

```{r fig.width=3, fig.height=3}
ggviolin(
  results,
  x = "nearest_G4",
  y = "deseq.lfc.pdc",
  fill = mypal[1],
  add = "median_iqr"
) + coord_cartesian(ylim = c(-0.5, 1.5)) + geom_hline(yintercept = 0, linewidth = 0.2) + geom_hline(
  yintercept = mean(results$deseq.lfc.pdc[results$nearest_G4 == "0kb"]),
  linetype = "dotted",
  linewidth = 0.2
) +
  stat_compare_means(label.y = 1.3,
                     label.x = 3,
                     size = 2)
ggsave(glue(
  "{plot_folder}Violin_CTCF_PhenDC3_lfc_by_predG4distance.pdf"
),
plot = last_plot())
```

```{r}
compare_means(deseq.lfc.pdc ~ nearest_G4, results)

```
```{r fig.width=5, fig.height=3}
ggviolin(
  results,
  x = "nearest_G4",
  y = "deseq.lfc.pds",
  fill = "pro",
  palette = mypal[c(1, 3, 5)],
  add = "median_iqr"
) + coord_cartesian(ylim = c(-0.5, 1.5)) + geom_hline(yintercept = 0, linewidth = 0.2) + geom_hline(
  yintercept = mean(results$deseq.lfc.pds[results$nearest_G4 == "0kb"]),
  linetype = "dotted",
  linewidth = 0.2
) +
  stat_compare_means(aes(group = pro),
                     label.y = 1.3,
                     label.x = 3,
                     size = 1.5)
ggsave(glue("{plot_folder}Violin_CTCF_PDS_lfc_by_predG4distance_pro.pdf"), plot = last_plot())
```
```{
r
}
compare_means(deseq.lfc.pds ~ nearest_G4, results[results$pro == "noPro", ])
compare_means(deseq.lfc.pds ~ nearest_G4, results[results$pro == "noPro", ]) %>%
write_tsv(., "pds_G4distance_plots-noPro-statistics.tsv")
  
```
```{r}
compare_means(deseq.lfc.pds ~ nearest_G4, results[results$pro == "Pro", ]) %>%
  write_tsv(.,
            glue("{stat_output}pds_G4distance_plots-Pro-statistics.tsv"))

compare_means(deseq.lfc.pds ~ pro, group.by = "nearest_G4", results) %>%
  write_tsv(.,
            glue(
              "{stat_output}pds_G4distance_plots-between_prom_types_stats.tsv"
            ))

compare_means(deseq.lfc.pds ~ nearest_G4, group.by = "pro", results) %>%
  write_tsv(.,
            glue(
              "{stat_output}pds_G4distance_plots-within_prom_types_stats.tsv"
            ))

```
```{r fig.width=5, fig.height=3}
ggviolin(
  results,
  x = "nearest_G4",
  y = "deseq.lfc.pdc",
  fill = "pro",
  palette = mypal[c(1, 3, 5)],
  add = "median_iqr"
) + coord_cartesian(ylim = c(-0.5, 1.5)) + geom_hline(yintercept = 0, linewidth = 0.2) + geom_hline(
  yintercept = mean(results$deseq.lfc.pdc[results$nearest_G4 == "0kb"]),
  linetype = "dotted",
  linewidth = 0.2
) +
  stat_compare_means(aes(group = pro),
                     label.y = 1.3,
                     label.x = 3,
                     size = 1.5)
ggsave(glue(
  "{plot_folder}Violin_CTCF_PhenDC3_lfc_by_predG4distance_pro.pdf"
),
plot = last_plot())
```

```{r}
compare_means(deseq.lfc.pdc ~ nearest_G4, results[results$pro == "noPro", ]) %>%
  write_tsv(.,
            glue(
              "{stat_output}phendc_G4distance_plots-noPro-statistics.tsv"
            ))

compare_means(deseq.lfc.pdc ~ pro, group.by = "nearest_G4", results) %>%
  write_tsv(.,
            glue(
              "{stat_output}phendc_G4distance_plots-between_prom_types_stats.tsv"
            ))

compare_means(deseq.lfc.pdc ~ nearest_G4, group.by = "pro", results) %>%
  write_tsv(.,
            glue(
              "{stat_output}phendc_G4distance_plots-within_prom_types_stats.tsv"
            ))


```
